Theory Negative_Association_Bloom_Filters
section ‹Application: Bloom Filters›
text ‹The false positive probability of Bloom Filters is a case where negative association is
really useful. Traditionally it is derived only approximately. Bloom~\cite{bloom1970} first derives
the expected number of bits set to true given the number of elements inserted, then the false
positive probability is computed, pretending that the expected number of bits is the actual number
of bits.
Both Blooms original derivation and Mitzenmacher and Upfal~\cite{mitzenmacher2017} use this method.
A more correct approach would be to derive a tail bound for the number of set bits and derive
a false-positive probability based on that, which unfortunately leads to a complex formula.
An exact result has later been derived using combinatorial methods by Gopinathan and
Sergey~\cite{gopinathan2020}. However their formula is less useful, as it consists of a sum
with Stirling numbers and binomial coefficients.
It is however easy to see that the original bound derived by Bloom is a correct upper bound for the
false positive probability using negative association. (This is pointed out by
Bao et al.~\cite{bao2021}.)
In this section, we derive the same bound using this library as an example for the applicability of
this library.›
theory Negative_Association_Bloom_Filters
imports Negative_Association_Permutation_Distributions
begin
fun bloom_filter_pmf where
"bloom_filter_pmf 0 d N = return_pmf {}" |
"bloom_filter_pmf (Suc n) d N = do {
h ← bloom_filter_pmf n d N;
a ← pmf_of_set {a. a ⊆ {..<(N::nat)} ∧ card a = d};
return_pmf (a ∪ h)
}"
lemma bloom_filter_neg_assoc:
assumes "d ≤ N"
shows "measure_pmf.neg_assoc (bloom_filter_pmf n d N) (λi ω. i ∈ ω) {..<N}"
proof (induction n)
case 0
have a:"measure_pmf.neg_assoc (bloom_filter_pmf 0 d N) (λ_ _. False) {..<N}"
by (intro measure_pmf.indep_imp_neg_assoc measure_pmf.indep_vars_const) auto
show ?case by (intro measure_pmf.neg_assoc_cong[OF _ _ a] AE_pmfI) simp_all
next
case (Suc n)
let ?l = "bloom_filter_pmf n d N"
let ?r = "pmf_of_set {a. a ⊆ {..<N} ∧ card a = d}"
define f where "f j ω = (ω (True,j) ∨ ω (False,j))" for ω and j :: nat
have f_borel: "f i ∈ borel_measurable (Pi⇩M (UNIV × {i}) (λ_. borel))" (is "?L ∈ ?R") for i
proof -
have "f i = (λω. max(fst ω) (snd ω)) ∘ (λω. (ω (True,i),ω (False,i)))" unfolding f_def by auto
also have "… ∈ ?R" by (intro measurable_comp[where N="borel ⨂⇩M borel"]) measurable
finally show ?thesis by simp
qed
have 0:"{True} ×{..<N} ∪ {False} ×{..<N} = UNIV×{..<N}" by auto
have s:"{b} × {..<N} = Pair b ` {..<N}" for b :: bool by auto
have "measure_pmf.neg_assoc (map_pmf snd (pair_pmf ?l ?r)) (λi ω. i ∈ ω) ({..<N})"
unfolding map_snd_pair_pmf using assms by (intro n_subsets_distribution_neg_assoc) auto
hence na_l:
"measure_pmf.neg_assoc (pair_pmf ?l ?r) (λi ω. snd i ∈ case_bool fst snd (fst i) ω) ({False} × {..<N})"
unfolding s neg_assoc_map_pmf by (subst measure_pmf.neg_assoc_reindex) (auto intro:inj_onI)
have "measure_pmf.neg_assoc (map_pmf fst (pair_pmf ?l ?r)) (∈) ({..<N})"
unfolding map_fst_pair_pmf using Suc by simp
hence na_r:
"measure_pmf.neg_assoc (pair_pmf ?l ?r) (λi ω. snd i ∈ case_bool fst snd (fst i) ω) ({True} × {..<N})"
unfolding s neg_assoc_map_pmf by (subst measure_pmf.neg_assoc_reindex) (auto intro:inj_onI)
have c: "prob_space.indep_var (pair_pmf ?l ?r)
(PiM ({True} × {..<N}) (λ_. borel)) x (PiM ({False} × {..<N}) (λ_. borel)) y"
if "x = ((λω. λi∈{True} × {..<N}. snd i∈ ω)∘fst)" "y=((λω. λi∈{False} × {..<N}. snd i ∈ ω)∘snd)"
for x y
unfolding that by (intro prob_space.indep_var_compose[OF _ indep_var_pair_pmf] prob_space_measure_pmf)
(auto simp:space_PiM)
have a:"measure_pmf.neg_assoc (pair_pmf ?l ?r) (λi ω. snd i ∈ case_bool fst snd (fst i) ω) (UNIV × {..<N})"
by (intro measure_pmf.neg_assoc_combine[OF _ 0] na_l na_r c) (auto simp: restrict_def mem_Times_iff)
have "measure_pmf.neg_assoc (pair_pmf ?l ?r) (λi ω. f i (λi. snd i ∈ case_bool fst snd (fst i) ω)) {..<N}"
by (intro measure_pmf.neg_assoc_compose[OF _ a, where deps="λj. UNIV×{j}" and η="Fwd"]
monotoneI depends_onI f_borel) (auto simp:f_def)
hence "measure_pmf.neg_assoc (pair_pmf ?l ?r) (λi ω. i ∈ fst ω ∨ i ∈ snd ω) {..<N}"
unfolding f_def by (simp add:case_prod_beta')
hence "measure_pmf.neg_assoc (map_pmf (case_prod (∪)) (pair_pmf ?l ?r)) (∈) {..<N}"
unfolding neg_assoc_map_pmf by (simp add:case_prod_beta')
thus ?case by (simp add:pair_pmf_def map_bind_pmf Un_commute)
qed
lemma bloom_filter_cell_prob:
assumes "d ≤ N" "i < N"
shows "measure (bloom_filter_pmf n d N) {ω. i ∈ ω} = 1 - (1 - real d/real N)^n"
proof -
have "measure (bloom_filter_pmf n d N) {ω. i ∉ ω} = (1 - real d/real N)^n"
proof (induction n)
case 0 thus ?case by simp
next
case (Suc n)
let ?p = "pair_pmf (bloom_filter_pmf n d N) (pmf_of_set {a. a ⊆ {..<N} ∧ card a = d})"
have a:" {ω. i ∉ fst ω ∧ i ∉ snd ω} = ({ω. i ∉ ω}) × ({ω. i ∉ ω})" by auto
have "measure ?p {ω. i ∉ fst ω ∧ i ∉ snd ω} = (1-real d/N) ^ n * (1-real d/card {..<N})"
using assms unfolding a measure_pair_pmf
by (intro Suc n_subsets_prob(1) arg_cong2[where f="(*)"]) auto
also have "… = (1-real d/N) ^ (n+1)" by simp
finally have "measure ?p {ω. i ∉ fst ω ∧ i ∉ snd ω} = (1-real d/N) ^ (n+1)" by simp
hence "measure (map_pmf (λω. snd ω ∪ fst ω) ?p) {ω. i ∉ ω} = (1-real d/N)^(n+1)"
by (simp add:disj_commute)
thus ?case by (simp add:pair_pmf_def map_bind_pmf)
qed
hence "1 - measure (bloom_filter_pmf n d N) {ω. i ∈ ω} = (1 - real d/real N)^n"
by (subst measure_pmf.prob_compl[symmetric]) (auto simp:set_diff_eq)
thus ?thesis by simp
qed
lemma bloom_filter_false_positive_prob:
assumes "d ≤ N" "T ⊆ {..<N}" "card T = d"
shows "measure (bloom_filter_pmf n d N) {ω. T ⊆ ω} ≤ (1 - (1 - real d/real N)^n)^d"
(is "?L ≤ ?R")
proof -
let ?p = "bloom_filter_pmf n d N"
have na: "measure_pmf.neg_assoc (bloom_filter_pmf n d N) (λi ω. i ∈ ω) T"
by (intro measure_pmf.neg_assoc_subset[OF assms(2) bloom_filter_neg_assoc] assms(1))
have fin_T: "finite T" using assms(2) finite_subset by auto
hence a: "of_bool (T ⊆ y) = (∏t∈T. of_bool (t ∈ y)::real)" for y
by (induction T) auto
have "?L = measure ?p ({ω. T ⊆ ω} ∩ space ?p)" by simp
also have "… = (∫ω. (∏t ∈ T. of_bool(t ∈ ω)) ∂?p)"
unfolding Bochner_Integration.integral_indicator[symmetric] indicator_def
using a by (intro integral_cong_AE AE_pmfI) auto
also have "… ≤ (∏t ∈ T. (∫ω. of_bool(t ∈ ω) ∂?p))"
by (intro has_int_thatD(2)[OF measure_pmf.neg_assoc_imp_prod_mono[OF _ na, where η="Fwd"]]
integrable_bounded_pmf bounded_range_imp[OF bounded_of_bool] fin_T
borel_measurable_continuous_onI) (auto intro:monoI)
also have "… = (∏t ∈ T. measure ?p ({ω. t ∈ ω} ∩ space ?p))"
unfolding Bochner_Integration.integral_indicator[symmetric] indicator_def by simp
also have "… = (∏t ∈ T. measure ?p {ω. t ∈ ω})" by simp
also have "… = (∏t ∈ T. 1 - (1 - real d/real N)^n)"
using assms(1,2) by (intro prod.cong bloom_filter_cell_prob) auto
also have "… = ?R" using assms(3) by simp
finally show ?thesis by simp
qed
end