Theory Negative_Association_Bloom_Filters

section ‹Application: Bloom Filters›

text ‹The false positive probability of Bloom Filters is a case where negative association is
really useful. Traditionally it is derived only approximately. Bloom~\cite{bloom1970} first derives
the expected number of bits set to true given the number of elements inserted, then the false
positive probability is computed, pretending that the expected number of bits is the actual number
of bits.

Both Blooms original derivation and Mitzenmacher and Upfal~\cite{mitzenmacher2017} use this method.

A more correct approach would be to derive a tail bound for the number of set bits and derive
a false-positive probability based on that, which unfortunately leads to a complex formula.

An exact result has later been derived using combinatorial methods by Gopinathan and
Sergey~\cite{gopinathan2020}. However their formula is less useful, as it consists of a sum
with Stirling numbers and binomial coefficients.

It is however easy to see that the original bound derived by Bloom is a correct upper bound for the
false positive probability using negative association. (This is pointed out by
Bao et al.~\cite{bao2021}.)

In this section, we derive the same bound using this library as an example for the applicability of
this library.›

theory Negative_Association_Bloom_Filters
  imports Negative_Association_Permutation_Distributions
begin

fun bloom_filter_pmf where
  "bloom_filter_pmf 0 d N = return_pmf {}" |
  "bloom_filter_pmf (Suc n) d N = do {
      h ← bloom_filter_pmf n d N;
      a ← pmf_of_set {a. a ⊆ {..<(N::nat)} ∧ card a = d};
      return_pmf (a ∪ h)
    }"

lemma bloom_filter_neg_assoc:
  assumes "d ≤ N"
  shows "measure_pmf.neg_assoc (bloom_filter_pmf n d N) (λi ω. i ∈ ω) {..<N}"
proof (induction n)
  case 0

  have a:"measure_pmf.neg_assoc (bloom_filter_pmf 0 d N) (λ_ _. False) {..<N}"
    by (intro measure_pmf.indep_imp_neg_assoc measure_pmf.indep_vars_const) auto
  show ?case by (intro measure_pmf.neg_assoc_cong[OF _ _ a] AE_pmfI) simp_all
next
  case (Suc n)
  let ?l = "bloom_filter_pmf n d N"
  let ?r = "pmf_of_set {a. a ⊆ {..<N} ∧ card a = d}"

  define f where "f j ω = (ω (True,j) ∨ ω (False,j))" for ω and j :: nat

  have f_borel: "f i ∈ borel_measurable (Pi⇩_M (UNIV × {i}) (λ_. borel))" (is "?L ∈ ?R") for i
  proof -
    have "f i = (λω. max(fst ω) (snd ω)) ∘ (λω. (ω (True,i),ω (False,i)))" unfolding f_def by auto
    also have "… ∈ ?R" by (intro measurable_comp[where N="borel ⨂⇩_M borel"]) measurable
    finally show ?thesis by simp
  qed

  have 0:"{True} ×{..<N} ∪ {False} ×{..<N} = UNIV×{..<N}" by auto

  have s:"{b} × {..<N} = Pair b ` {..<N}" for b :: bool by auto

  have "measure_pmf.neg_assoc (map_pmf snd (pair_pmf ?l ?r)) (λi ω. i ∈ ω) ({..<N})"
    unfolding map_snd_pair_pmf using assms by (intro n_subsets_distribution_neg_assoc) auto
  hence na_l:
    "measure_pmf.neg_assoc (pair_pmf ?l ?r) (λi ω. snd i ∈ case_bool fst snd (fst i) ω) ({False} × {..<N})"
    unfolding s neg_assoc_map_pmf by (subst measure_pmf.neg_assoc_reindex) (auto intro:inj_onI)

  have "measure_pmf.neg_assoc (map_pmf fst (pair_pmf ?l ?r)) (∈) ({..<N})"
    unfolding map_fst_pair_pmf using Suc by simp
  hence na_r:
    "measure_pmf.neg_assoc (pair_pmf ?l ?r) (λi ω. snd i ∈ case_bool fst snd (fst i) ω) ({True} × {..<N})"
    unfolding s neg_assoc_map_pmf by (subst measure_pmf.neg_assoc_reindex) (auto intro:inj_onI)

  have c: "prob_space.indep_var (pair_pmf ?l ?r)
     (PiM ({True} × {..<N}) (λ_. borel)) x (PiM ({False} × {..<N}) (λ_. borel)) y"
    if "x = ((λω. λi∈{True} × {..<N}. snd i∈ ω)∘fst)" "y=((λω. λi∈{False} × {..<N}. snd i ∈ ω)∘snd)"
    for x y
    unfolding that by (intro prob_space.indep_var_compose[OF _ indep_var_pair_pmf] prob_space_measure_pmf)
      (auto simp:space_PiM)

  have a:"measure_pmf.neg_assoc (pair_pmf ?l ?r) (λi ω. snd i ∈ case_bool fst snd (fst i) ω) (UNIV × {..<N})"
    by (intro measure_pmf.neg_assoc_combine[OF _ 0] na_l na_r c) (auto simp: restrict_def mem_Times_iff)
  have "measure_pmf.neg_assoc (pair_pmf ?l ?r) (λi ω. f i (λi. snd i ∈ case_bool fst snd (fst i) ω)) {..<N}"
    by (intro measure_pmf.neg_assoc_compose[OF _ a, where deps="λj. UNIV×{j}" and η="Fwd"]
        monotoneI depends_onI f_borel) (auto simp:f_def)
  hence "measure_pmf.neg_assoc (pair_pmf ?l ?r) (λi ω. i ∈ fst ω ∨ i ∈ snd ω) {..<N}"
    unfolding f_def by (simp add:case_prod_beta')
  hence "measure_pmf.neg_assoc (map_pmf (case_prod (∪)) (pair_pmf ?l ?r)) (∈) {..<N}"
    unfolding neg_assoc_map_pmf by (simp add:case_prod_beta')
  thus ?case by (simp add:pair_pmf_def map_bind_pmf Un_commute)
qed

lemma bloom_filter_cell_prob:
  assumes "d ≤ N" "i < N"
  shows "measure (bloom_filter_pmf n d N) {ω. i ∈ ω} = 1 - (1 - real d/real N)^n"
proof -
  have "measure (bloom_filter_pmf n d N) {ω. i ∉ ω} = (1 - real d/real N)^n"
  proof (induction n)
    case 0 thus ?case by simp
  next
    case (Suc n)
    let ?p = "pair_pmf (bloom_filter_pmf n d N) (pmf_of_set {a. a ⊆ {..<N} ∧ card a = d})"

    have a:" {ω. i ∉ fst ω ∧ i ∉ snd ω} = ({ω. i ∉ ω}) × ({ω. i ∉ ω})" by auto

    have "measure ?p {ω. i ∉ fst ω ∧ i ∉ snd ω} = (1-real d/N) ^ n *  (1-real d/card {..<N})"
      using assms unfolding a measure_pair_pmf
      by (intro Suc n_subsets_prob(1) arg_cong2[where f="(*)"]) auto
    also have "… =  (1-real d/N) ^ (n+1)" by simp
    finally have "measure ?p {ω. i ∉ fst ω ∧ i ∉ snd ω} = (1-real d/N) ^ (n+1)" by simp

    hence "measure (map_pmf (λω. snd ω ∪ fst ω) ?p) {ω. i ∉ ω} =  (1-real d/N)^(n+1)"
      by (simp add:disj_commute)
    thus ?case by (simp add:pair_pmf_def map_bind_pmf)
  qed
  hence "1 - measure (bloom_filter_pmf n d N) {ω. i ∈ ω} = (1 - real d/real N)^n"
    by (subst measure_pmf.prob_compl[symmetric]) (auto simp:set_diff_eq)
  thus ?thesis by simp
qed

lemma bloom_filter_false_positive_prob:
  assumes "d ≤ N" "T ⊆ {..<N}" "card T = d"
  shows "measure (bloom_filter_pmf n d N) {ω. T ⊆ ω} ≤ (1 - (1 - real d/real N)^n)^d"
    (is "?L ≤ ?R")
proof -
  let ?p = "bloom_filter_pmf n d N"
  have na: "measure_pmf.neg_assoc (bloom_filter_pmf n d N) (λi ω. i ∈ ω) T"
    by (intro measure_pmf.neg_assoc_subset[OF assms(2) bloom_filter_neg_assoc] assms(1))

  have fin_T: "finite T" using assms(2) finite_subset by auto
  hence a: "of_bool (T ⊆ y) = (∏t∈T. of_bool (t ∈ y)::real)" for y
    by (induction T) auto

  have "?L = measure ?p ({ω. T ⊆ ω} ∩ space ?p)" by simp
  also have "… = (∫ω. (∏t ∈ T. of_bool(t ∈ ω)) ∂?p)"
    unfolding Bochner_Integration.integral_indicator[symmetric] indicator_def
    using a by (intro integral_cong_AE AE_pmfI) auto
  also have "… ≤ (∏t ∈ T. (∫ω. of_bool(t ∈ ω) ∂?p))"
    by (intro has_int_thatD(2)[OF measure_pmf.neg_assoc_imp_prod_mono[OF _ na, where η="Fwd"]]
        integrable_bounded_pmf bounded_range_imp[OF bounded_of_bool] fin_T
        borel_measurable_continuous_onI) (auto intro:monoI)
  also have "… = (∏t ∈ T. measure ?p ({ω. t ∈ ω} ∩ space ?p))"
    unfolding Bochner_Integration.integral_indicator[symmetric] indicator_def by simp
  also have "… = (∏t ∈ T. measure ?p {ω. t ∈ ω})" by simp
  also have "… = (∏t ∈ T. 1 - (1 - real d/real N)^n)"
    using assms(1,2) by (intro prod.cong bloom_filter_cell_prob) auto
  also have "… = ?R" using assms(3) by simp
  finally show ?thesis by simp
qed

end