Theory Tutorial_Pseudorandom_Objects

section ‹Tutorial on the use of Pseudorandom-Objects›

theory Tutorial_Pseudorandom_Objects
  imports
    Universal_Hash_Families.Pseudorandom_Objects_Hash_Families
    Expander_Graphs.Pseudorandom_Objects_Expander_Walks
    Equivalence_Relation_Enumeration.Equivalence_Relation_Enumeration
    Median_Method.Median
    Concentration_Inequalities.Bienaymes_Identity
    Frequency_Moments.Frequency_Moments
begin

text ‹This section is a tutorial for the use of pseudorandom objects. Starting from the
approximation algorithm for the second frequency moment by Alon et al.~\cite{alon1999}, we will
improve the solution until we achieve a space complexity of
$\mathcal O(\ln n + \varepsilon^{-2} \ln(\delta^{-1}) \ln m)$, where $n$ denotes the range of the
stream elements, $m$ denotes the length of the stream, $\varepsilon$ denotes the desired accuracy
and $\delta$ denotes the desired failure probability.

The construction relies on a combination of pseudorandom object, in particular an expander walk
and two chained hash families.›

hide_const (open) topological_space_class.discrete
hide_const (open) Abstract_Rewriting.restrict
hide_fact (open) Abstract_Rewriting.restrict_def
hide_fact (open) Henstock_Kurzweil_Integration.integral_cong
hide_fact (open) Henstock_Kurzweil_Integration.integral_mult_right
hide_fact (open) Henstock_Kurzweil_Integration.integral_diff

text ‹The following lemmas show a one-side and two-sided Chernoff-bound for $\{0,1\}$-valued
independent identically distributed random variables. This to show the similarity with expander
walks, for which similar bounds can be established:
@{thm [source] expander_chernoff_bound_one_sided} and @{thm [source] expander_chernoff_bound}.›

lemma classic_chernoff_bound_one_sided:
  fixes l :: nat
  assumes "AE x in measure_pmf p. f x ∈ {0,1::real}"
  assumes "(∫x. f x ∂p) ≤ μ" "l > 0" "γ ≥ 0"
  shows "measure (prod_pmf {0..<l} (λ_. p)) {w. (∑i<l. f (w i))/l-μ≥γ} ≤ exp (- 2 * real l * γ^2)"
    (is "?L ≤ ?R")
proof -
  define ν  where "ν = real l*(∫x. f x ∂p)"
  let ?p = "prod_pmf {0..<l} (λ_. p)"

  have 1: "prob_space.indep_vars (measure_pmf ?p) (λ_. borel) (λi x. f (x i)) {0..<l}"
    by (intro prob_space.indep_vars_compose2[OF _ indep_vars_Pi_pmf] prob_space_measure_pmf) auto

  have "f (y i) ∈ {0..1}" if "y ∈ {0..<l} →⇩_E set_pmf p" "i ∈ {0..<l}" for y i
  proof -
    have "y i ∈ set_pmf p" using that by auto
    thus ?thesis using assms(1) unfolding AE_measure_pmf_iff by auto
  qed
  hence 2: "AE x in measure_pmf ?p. f (x i) ∈ {0..1}"
    if "i ∈ {0..<l}" for i
    using that by (intro AE_pmfI) (auto simp: set_prod_pmf)

  have "(∑i=0..<l. (∫x. f (x i) ∂?p)) = (∑i<l. (∫x. f x ∂map_pmf (λx. x i) ?p))"
    by (auto simp:atLeast0LessThan)
  also have "... = (∑i<l. (∫x. f x ∂p))" by (subst Pi_pmf_component) auto
  also have "... = ν" unfolding ν_def by simp
  finally have 3: "(∑i=0..<l. (∫x. f (x i) ∂prod_pmf {0..<l} (λ_. p))) = ν" by simp

  have 4: "ν ≤ real l * μ" unfolding ν_def using assms(2) by (simp add: mult_le_cancel_left)

  interpret Hoeffding_ineq "measure_pmf ?p" "{0..<l}" "λi x. f (x i)" "(λ_. 0)" "(λ_. 1)" ν
    using 1 2 unfolding 3 by unfold_locales auto

  have "?L ≤ measure ?p {x. (∑i=0..<l. f (x i)) ≥ real l*μ + real l*γ}"
    using assms(3) by (intro pmf_mono) (auto simp:field_simps atLeast0LessThan)
  also have "... ≤ measure ?p {x ∈ space ?p. (∑i=0..<l. f (x i)) ≥ ν + real l*γ}"
    using 4 by (intro pmf_mono) auto
  also have "... ≤ exp (- 2 * (real l * γ)^2 / (∑i=0..<l. (1 - 0)⇧²))"
    using assms(3,4) by (intro Hoeffding_ineq_ge) auto
  also have "... = ?R" using assms(3) by (simp add:power2_eq_square)
  finally show ?thesis by simp
qed

lemma classic_chernoff_bound:
  assumes "AE x in measure_pmf p. f x ∈ {0,1::real}" "l > (0::nat)" "γ ≥ 0"
  defines "μ ≡ (∫x. f x ∂p)"
  shows "measure (prod_pmf {0..<l} (λ_. p)) {w. ¦(∑i<l. f (w i))/l-μ¦≥γ} ≤ 2*exp (-2*real l*γ^2)"
    (is "?L ≤ ?R")
proof -
  have [simp]:"integrable p f" using assms(1) unfolding AE_measure_pmf_iff
    by (intro integrable_bounded_pmf boundedI[where B="1"]) auto
  let ?w = "prod_pmf {0..<l} (λ_. p)"
  have "?L ≤ measure ?w {w. (∑i<l. f (w i))/l-μ≥γ} + measure ?w {w. (∑i<l. f (w i))/l-μ≤-(γ)}"
    by (intro pmf_add) auto
  also have "... ≤ exp (-2*real l*γ^2) + measure ?w {w. -((∑i<l. f (w i))/l-μ)≥γ}"
    using assms by (intro add_mono classic_chernoff_bound_one_sided) (auto simp:algebra_simps)
  also have "... ≤ exp (-2*real l*γ^2) + measure ?w {w. ((∑i<l. 1-f (w i))/l-(1-μ))≥γ}"
    using assms(2) by (auto simp: sum_subtractf field_simps)
  also have "... ≤ exp (-2*real l*γ^2) + exp (-2*real l*γ^2)"
    using assms by (intro add_mono classic_chernoff_bound_one_sided) auto
  also have "... = ?R" by simp
  finally show ?thesis by simp
qed

text ‹Definition of the second frequency moment of a stream.›

definition F2 :: "'a list ⇒ real" where
  "F2 xs = (∑ x ∈ set xs. (of_nat (count_list xs x)^2))"

lemma prime_power_ls: "is_prime_power (pro_size (ℒ [- 1, 1]))"
proof -
  have "is_prime_power ((2::nat)^1)" by (intro is_prime_powerI) auto
  thus "is_prime_power (pro_size (ℒ [- 1, 1]))" by (auto simp:list_pro_size numeral_eq_Suc)
qed

lemma prime_power_h2: "is_prime_power (pro_size (ℋ 4 n (ℒ [- 1, 1::real])))"
  by (intro hash_pro_size_prime_power prime_power_ls) auto

abbreviation Ψ where "Ψ ≡ pmf_of_set {-1,1::real}"

lemma f2_exp:
  assumes "finite (set_pmf p)"
  assumes "⋀I. I ⊆ {0..<n} ⟹ card I ≤ 4 ⟹ map_pmf (λx. (λi∈I. x i)) p = prod_pmf I (λ_. Ψ)"
  assumes "set xs ⊆ {0..<n::nat}"
  shows "(∫h. (∑x ← xs. h x)^2 ∂p) = F2 xs" (is "?L = ?R")
proof -
  let ?c = "(λx. real (count_list xs x))"

  have [simp]: "integrable (measure_pmf p) f" for f :: "_ ⇒ real"
    by (intro integrable_measure_pmf_finite assms)

  have 0:"(∫h. h x * h y ∂p) = of_bool (x = y)"
    (is "?L1 = ?R1") if "x ∈ set xs" "y ∈ set xs" for x y
  proof -
    have xy_lt_n: "x < n" "y < n"  using assms that by auto
    have card_xy: "card {x,y} ≤ 4" by (cases "x = y") auto

    have "?L1 = (∫h. (h x * h y) ∂map_pmf (λf. restrict f {x,y}) p)"
      by simp
    also have "... = (∫h. (h x * h y) ∂prod_pmf {x,y} (λ_. Ψ))"
      using xy_lt_n card_xy by (intro integral_cong assms(2) arg_cong[where f="measure_pmf"]) auto
    also have "... = of_bool (x = y)" (is "?L2 = ?R2")
    proof (cases "x = y")
      case True
      hence "?L2 = (∫h. (h x ^2) ∂prod_pmf {x} (λ_. pmf_of_set {-1,1}))"
        unfolding power2_eq_square by simp
      also have "... = (∫x. x^2 ∂pmf_of_set {-1,1})"
        unfolding Pi_pmf_singleton by simp
      also have "... = 1" by (subst integral_pmf_of_set) auto
      also have "... = ?R2" using True by simp
      finally show ?thesis by simp
    next
      case False
      hence "?L2 = (∫h. (∏i∈{x,y}. h i) ∂prod_pmf {x,y} (λ_. pmf_of_set {-1,1}))" by simp
      also have "... = (∏i∈{x,y}. (∫x. x ∂pmf_of_set {-1,1}))"
        by (intro expectation_prod_Pi_pmf integrable_measure_pmf_finite) auto
      also have "... = 0" using False by (subst integral_pmf_of_set) auto
      also have "... = ?R2" using False by simp
      finally show ?thesis by simp
    qed
    finally show ?thesis by simp
  qed

  have "?L = (∫h. (∑x ∈ set xs. real (count_list xs x) * h x)^2 ∂p)"
    unfolding sum_list_eval by simp
  also have "... = (∫h. (∑x ∈ set xs. (∑y ∈ set xs. (?c x * ?c y) * h x * h y)) ∂p)"
    unfolding power2_eq_square sum_distrib_left sum_distrib_right by (simp add:ac_simps)
  also have "... = (∑x ∈ set xs. (∑y ∈ set xs. (∫h. (?c x * ?c y) * h x * h y ∂p)))" by simp
  also have "... = (∑x ∈ set xs. (∑y ∈ set xs. ?c x * ?c y * (∫h. h x * h y ∂p)))"
    by (subst integral_mult_right[symmetric]) (simp_all add:ac_simps)
  also have "... = (∑x ∈ set xs. (∑y ∈ set xs. ?c x * ?c y * of_bool (x = y)))"
    by (intro sum.cong refl) (simp add: 0)
  also have "... = (∑x ∈ set xs. ?c x^2) "
    unfolding of_bool_def by (simp add:if_distrib if_distribR sum.If_cases power2_eq_square)
  also have "... = F2 xs" unfolding F2_def by simp
  finally show ?thesis by simp
qed

lemma f2_exp_sq:
  assumes "finite (set_pmf p)"
  assumes "⋀I. I ⊆ {0..<n} ⟹ card I ≤ 4 ⟹ map_pmf (λx. (λi∈I. x i)) p = prod_pmf I (λ_. Ψ)"
  assumes "set xs ⊆ {0..<n::nat}"
  shows "(∫h. ((∑x←xs. h x)^2)^2 ∂p) ≤ 3 * F2 xs^2" (is "?L ≤ ?R")
proof -
  let ?c = "(λx. real (count_list xs x))"

  have [simp]: "integrable (measure_pmf p) f" for f :: "_ ⇒ real"
    by (intro integrable_measure_pmf_finite assms)

  define S where "S = set xs"

  have a: "finite S" unfolding S_def by simp

  define Q :: "nat ⇒ nat ⇒ nat ⇒ nat ⇒ real"
    where "Q a b c d =
      of_bool(a=b∧c=d∧a≠c) + of_bool(a=c∧b=d∧a≠b) +
      of_bool(a=d∧b=c∧a≠b) + of_bool(a=b∧b=c∧c=d)" for a b c d

  have cases: "(∫h. h a*h b*h c*h d ∂p) = Q a b c d" (is "?L1 = ?R1")
    if "a ∈ S" "b∈S" "c ∈ S" "d ∈ S" for a b c d
  proof -
    have "card {a,b,c,d} = card (set [a,b,c,d])" by (intro arg_cong[where f="card"]) auto
    also have "... ≤ length [a,b,c,d]" by (intro card_length)
    finally have card: "card {a, b, c, d} ≤ 4" by simp

    have "?L1 = (∫h. h a*h b*h c*h d ∂map_pmf (λf. restrict f {a,b,c,d}) p)" by simp
    also have "... = (∫h. h a*h b*h c*h d ∂prod_pmf {a,b,c,d} (λ_. Ψ))" using that assms(3)
      by (intro integral_cong arg_cong[where f="measure_pmf"] assms(2) card) (auto simp:S_def)
    also have "... = (∫h. (∏i ←[a,b,c,d]. h i) ∂prod_pmf {a,b,c,d} (λ_. Ψ))" by (simp add:ac_simps)
    also have "... = (∫h. (∏i ∈{a,b,c,d}. h i^count_list [a,b,c,d] i) ∂prod_pmf {a,b,c,d} (λ_. Ψ))"
      by (subst prod_list_eval) auto
    also have "... = (∏i ∈ {a,b,c,d}. (∫x. x^count_list [a,b,c,d] i ∂Ψ))"
      by (intro expectation_prod_Pi_pmf integrable_measure_pmf_finite) auto
    also have "... = (∏i ∈ {a,b,c,d}. of_bool (even (count_list [a,b,c,d] i)))"
      by (intro prod.cong refl) (auto simp:integral_pmf_of_set)
    also have "... = (∏i ∈ set (remdups [a,b,c,d]). of_bool (even (count_list [a,b,c,d] i)))"
      by (intro prod.cong refl) auto
    also have "... = (∏i ← remdups [a,b,c,d]. of_bool (even (count_list [a,b,c,d] i)))"
      by (intro prod.distinct_set_conv_list) auto
    also have "... = Q a b c d" unfolding Q_def by simp
    finally show ?thesis by simp
  qed

  have "?L = (∫h. (∑x ∈ S. real (count_list xs x) * h x)^4 ∂p)"
    unfolding S_def sum_list_eval by simp
  also have "... = (∫h. (∑a∈S. (∑b∈S.(∑c∈S.(∑d∈S.(?c a*?c b*?c c*?c d)*h a*h b*h c*h d)))) ∂p)"
    unfolding power4_eq_xxxx sum_distrib_left sum_distrib_right by (simp add:ac_simps)
  also have "... = (∑a∈S.(∑b∈S.(∑c∈S.(∑d∈S.(∫h. (?c a*?c b*?c c*?c d)*h a*h b*h c*h d ∂p)))))"
    by simp
  also have "... = (∑a∈S.(∑b∈S.(∑c∈S.(∑d∈S. (?c a*?c b*?c c*?c d)*(∫h. h a*h b*h c*h d ∂p)))))"
    by (subst integral_mult_right[symmetric]) (simp_all add:ac_simps)
  also have "... = (∑a∈S.(∑b∈S.(∑c∈S.(∑d∈S. (?c a*?c b*?c c*?c d)*(Q a b c d)))))"
    by (intro sum.cong refl) (simp add:cases)
  also have "... = 1*(∑a∈S. ?c a^4) + 3*(∑a∈S. (∑b∈S. ?c a^2 * ?c b^2 * of_bool(a≠b)))"
    unfolding Q_def
    by (simp add: sum.distrib distrib_left sum_collapse[OF a] ac_simps sum_distrib_left[symmetric]
        power2_eq_square power4_eq_xxxx)
  also have "... ≤ 3*(∑a∈S. ?c a^4) + 3*(∑a∈S. (∑b∈S. ?c a^2 * ?c b^2 * of_bool(a≠b)))"
    by (intro add_mono mult_right_mono sum_nonneg) auto
  also have "... = 3*(∑a∈S. (∑b∈S. ?c a^2 * ?c b^2 * (of_bool (a=b) + of_bool(a≠b))))"
    using a by (simp add: sum.distrib distrib_left)
  also have "... = 3*(∑a∈S. (∑b∈S. ?c a^2 * ?c b^2 * 1))"
    by (intro sum.cong arg_cong2[where f="(*)"] refl) auto
  also have "... = 3 * F2 xs^2" unfolding F2_def power2_eq_square
    by (simp add: S_def sum_distrib_left sum_distrib_right ac_simps)
  finally show "?L ≤ 3 * F2 xs^2" by simp
qed

lemma f2_var:
  assumes "finite (set_pmf p)"
  assumes "⋀I. I ⊆ {0..<n} ⟹ card I ≤ 4 ⟹ map_pmf (λx. (λi∈I. x i)) p = prod_pmf I (λ_. Ψ)"
  assumes "set xs ⊆ {0..<n::nat}"
  shows "measure_pmf.variance p (λh. (∑x←xs. h x)^2) ≤ 2* F2 xs^2"
    (is "?L ≤ ?R")
proof -
  have [simp]: "integrable (measure_pmf p) f" for f :: "_ ⇒ real"
    by (intro integrable_measure_pmf_finite assms)

  have "?L = (∫h. ((∑x←xs. h x)^2)^2 ∂p) - F2 xs^2"
    by (subst measure_pmf.variance_eq) (simp_all add:f2_exp[OF assms(1-3)])
  also have "... ≤ 3 * F2 xs^2 - F2 xs^2"
    by (intro diff_mono f2_exp_sq[OF assms]) auto
  finally show ?thesis by simp
qed

lemma
  assumes "s ∈ set_pmf (ℋ⇩_P 4 n (ℒ [-1,1]))"
  assumes "set xs ⊆ {0..<n}"
  shows f2_exp_hp: "(∫h. (∑x ← xs. h x)^2 ∂sample_pro s) = F2 xs" (is "?T1")
    and f2_exp_sq_hp: "(∫h. ((∑x ← xs. h x)^2)^2 ∂sample_pro s) ≤ 3* F2 xs^2" (is "?T2")
    and f2_var_hp: "measure_pmf.variance s (λh. (∑x ← xs. h x)^2) ≤ 2* F2 xs^2"  (is "?T3")
proof -
  have 0:"map_pmf (λx. restrict x I) (sample_pro s) = prod_pmf I (λ_. Ψ)" (is "?L = _")
    if "I ⊆ {0..<n}" "card I ≤ 4" for I
  proof -
    have "?L = prod_pmf I (λ_. sample_pro (ℒ [- 1, 1]))"
      using that by (intro hash_pro_pmf_distr[OF _ assms(1)] prime_power_ls) auto
    also have "... = prod_pmf I (λ_. Ψ)" by (subst list_pro_2) auto
    finally show ?thesis by simp
  qed

  show ?T1 by (intro f2_exp[OF _ _ assms(2)] finite_pro_set 0) simp
  show ?T2 by (intro f2_exp_sq[OF _ _ assms(2)] finite_pro_set 0) simp
  show ?T3 by (intro f2_var[OF _ _ assms(2)] finite_pro_set 0) simp
qed

lemmas f2_exp_h = f2_exp_hp[OF hash_pro_in_hash_pro_pmf[OF prime_power_ls]]
lemmas f2_var_h = f2_var_hp[OF hash_pro_in_hash_pro_pmf[OF prime_power_ls]]

lemma F2_definite:
  assumes "xs ≠ []"
  shows "F2 xs > 0"
proof -
  have "0 < real (card (set xs))" using assms by (simp add: card_gt_0_iff)
  also have "... = (∑x ∈ set xs. 1)" by simp
  also have "... ≤ F2 xs" using count_list_gr_1 unfolding F2_def by (intro sum_mono) force
  finally show ?thesis by simp
qed

text ‹The following algorithm uses a completely random function, accordingly it requires
a lot of space: $\mathcal O(n + \ln m)$.›

fun example_1 :: "nat ⇒ nat list ⇒ real pmf"
  where "example_1 n xs =
    do {
      h ← prod_pmf {0..<n} (λ_. pmf_of_set {-1,1::real});
      return_pmf ((∑x ← xs. h x)^2)
    }"

lemma example_1_correct:
  assumes "set xs ⊆ {0..<n}"
  shows
    "measure_pmf.expectation (example_1 n xs) id = F2 xs" (is "?L1 = ?R1")
    "measure_pmf.variance (example_1 n xs) id ≤ 2 * F2 xs^2" (is "?L2 ≤ ?R2")
proof -
  have "?L1 = (∫h. (∑x ← xs. h x)^2 ∂prod_pmf {0..<n} (λ_. Ψ))"
    by (simp add:map_pmf_def[symmetric])
  also have "... = ?R1" using assms by (intro f2_exp)
      (auto intro: Pi_pmf_subset[symmetric] simp add:restrict_def set_Pi_pmf)
  finally show "?L1 = ?R1" by simp

  have "?L2 = measure_pmf.variance (prod_pmf {0..<n} (λ_. Ψ)) (λh. (∑x ← xs. h x)^2)"
    by (simp add:map_pmf_def[symmetric] atLeast0LessThan)
  also have "... ≤ ?R2"
    using assms by (intro f2_var)
      (auto intro: Pi_pmf_subset[symmetric] simp add:restrict_def set_Pi_pmf)
  finally show "?L2 ≤ ?R2" by simp
qed

text ‹This version replaces a the use of completely random function with a pseudorandom object,
it requires a lot less space: $\mathcal O(\ln n + \ln m)$.›

fun example_2 :: "nat ⇒ nat list ⇒ real pmf"
  where "example_2 n xs =
    do {
      h ← sample_pro (ℋ 4 n (ℒ [-1,1]));
      return_pmf ((∑x ← xs. h x)^2)
    }"

lemma example_2_correct:
  assumes "set xs ⊆ {0..<n}"
  shows
    "measure_pmf.expectation (example_2 n xs) id = F2 xs" (is "?L1 = ?R1")
    "measure_pmf.variance (example_2 n xs) id ≤ 2 * F2 xs^2" (is "?L2 ≤ ?R2")
proof -
  have "?L1 = (∫h. (∑x ← xs. h x)^2 ∂sample_pro (ℋ 4 n (ℒ [-1,1])))"
    by (simp add:map_pmf_def[symmetric])
  also have "... = ?R1"
    using assms by (intro f2_exp_h) auto
  finally show "?L1 = ?R1" by simp

  have "?L2 = measure_pmf.variance (sample_pro (ℋ 4 n (ℒ [-1,1]))) (λh. (∑x ← xs. h x)^2)"
    by (simp add:map_pmf_def[symmetric])
  also have "... ≤ ?R2"
    using assms by (intro f2_var_h) auto
  finally show "?L2 ≤ ?R2" by simp
qed

text ‹The following version replaces the deterministic construction of the pseudorandom object
with a randomized one. This algorithm is much faster, but the correctness proof is more difficult.›

fun example_3 :: "nat ⇒ nat list ⇒ real pmf"
  where "example_3 n xs =
    do {
      h ← sample_pro =<< ℋ⇩_P 4 n (ℒ [-1,1]);
      return_pmf ((∑x ← xs. h x)^2)
    }"

lemma
  assumes "set xs ⊆ {0..<n}"
  shows
    "measure_pmf.expectation (example_3 n xs) id = F2 xs" (is "?L1 = ?R1")
    "measure_pmf.variance (example_3 n xs) id ≤ 2 * F2 xs^2" (is "?L2 ≤ ?R2")
proof -
  let ?p = "ℋ⇩_P 4 n (ℒ [-1,1::real])"
  let ?q = "bind_pmf ?p sample_pro"

  have "¦h x¦ ≤ 1" if that1: "M ∈ set_pmf ?p" "h ∈ pro_set M" "x ∈ set xs" for h M x
  proof -
    obtain i where 1:"h = pro_select M i"
      using that1(2) unfolding set_sample_pro[of "M"] by auto
    have "h x ∈ pro_set (ℒ [-1,1::real])"
      unfolding 1 using that(1) by (intro hash_pro_pmf_range[OF prime_power_ls])  auto
    thus ?thesis by (auto simp: list_pro_set)
  qed

  hence 0: "bounded ((λxa. xa x) ` set_pmf ?q)" if "x ∈ set xs" for x
    using that by (intro boundedI[where B="1"]) auto

  have "(∫h. (∑x ← xs. h x)^2 ∂?q) = (∫s. (∫h. (∑x ← xs. h x)^2 ∂sample_pro s) ∂?p)"
    by (intro integral_bind_pmf bounded_pow bounded_sum_list 0)
  also have "... = (∫s. F2 xs ∂?p)"
    by (intro integral_cong_AE AE_pmfI f2_exp_hp[OF _ assms]) simp_all
  also have "... = ?R1" by simp
  finally have a:"(∫h. (∑x ← xs. h x)^2 ∂?q) = ?R1" by simp
  thus "?L1 = ?R1" by (simp add:map_pmf_def[symmetric])

  have "?L2 = measure_pmf.variance ?q (λh. (∑x ← xs. h x)^2)"
    by (simp add:map_pmf_def[symmetric])
  also have "... = (∫h. ((∑x ← xs. h x)^2)^2 ∂?q) -  (∫h. (∑x ← xs. h x)^2 ∂?q)^2"
    by (intro measure_pmf.variance_eq integrable_bounded_pmf bounded_pow bounded_sum_list 0)
  also have "... = (∫s. (∫h. ((∑x ← xs. h x)^2)^2 ∂sample_pro s) ∂?p)- (F2 xs)⇧²"
    unfolding a
    by (intro arg_cong2[where f="(-)"] integral_bind_pmf refl bounded_pow bounded_sum_list 0)
  also have "... ≤ (∫s. 3*F2 xs^2  ∂?p) - (F2 xs)^2"
    by (intro diff_mono integral_mono_AE' AE_pmfI f2_exp_sq_hp[OF _ assms]) simp_all
  also have "... = ?R2" by simp
  finally show "?L2 ≤ ?R2" by simp
qed

context
  fixes ε δ :: real
  assumes ε_gt_0: "ε > 0"
  assumes δ_range: "δ ∈ {0<..<1}"
begin

text ‹By using the mean of many independent parallel estimates the following algorithm
achieves a relative accuracy of $\varepsilon$, with probability $\frac{3}{4}$.
It requires $\mathcal O(\varepsilon^{-2} (\ln n + \ln m))$ bits of space.›

fun example_4 :: "nat ⇒ nat list ⇒ real pmf"
  where "example_4 n xs =
    do {
      let s = nat ⌈8 / ε^2⌉;
      h ← prod_pmf {0..<s} (λ_. sample_pro (ℋ 4 n (ℒ [-1,1])));
      return_pmf ((∑j < s. (∑x ← xs. h j x)^2)/s)
    }"

lemma example_4_correct_aux:
  assumes "set xs ⊆ {0..<n}"
  defines "s ≡ nat ⌈8 / ε^2⌉"
  defines "R ≡ (λh :: nat ⇒ nat ⇒ real. (∑j<s. (∑x←xs. h j x)^2)/real s)"
  assumes fin: "finite (set_pmf p)"
  assumes indep: "prob_space.k_wise_indep_vars (measure_pmf p) 2 (λ_. discrete) (λi x. x i) {..<s}"
  assumes comp: "⋀i. i < s ⟹ map_pmf (λx. x i) p = sample_pro (ℋ 4 n (ℒ [-1,1]))"
  shows "measure p {h. ¦R h - F2 xs¦ > ε * F2 xs} ≤ 1/4" (is "?L ≤ ?R")
proof (cases "xs = []")
  case True thus ?thesis by (simp add:R_def F2_def)
next
  case False
  note f2_gt_0 = F2_definite[OF False]
  let ?p = "sample_pro (ℋ 4 n (ℒ [-1,1::real]))"

  have [simp]: "integrable (measure_pmf p) f" for f :: "_ ⇒ real"
    by (intro integrable_measure_pmf_finite fin)

  have "8 / ε⇧² > 0" using ε_gt_0 by (intro divide_pos_pos) auto
  hence 0:"⌈8 / ε⇧²⌉ > 0" by simp
  hence 1: "s > 0" unfolding s_def by simp

  have "(∫h. R h ∂p) = (∑j<s. (∫h. (∑x←xs. h j x)^2 ∂p))/real s" unfolding R_def by simp
  also have "... = (∑j<s. (∫h. (∑x←xs. h x)^2 ∂(map_pmf(λh. h j)p)))/real s" by simp
  also have "... = (∑j<s. (∫h. (∑x←xs. h x)^2 ∂?p))/real s"
    by (intro sum.cong arg_cong2[where f="(/)"] refl) (simp add: comp)
  also have "... = F2 xs" using 1 unfolding f2_exp_h[OF assms(1)] by simp
  finally have exp_R: "(∫h. R h ∂p) = F2 xs" by simp

  have "measure_pmf.variance p R = measure_pmf.variance p (λh. (∑j<s. (∑x←xs. h j x)^2))/s^2"
    unfolding R_def by (subst measure_pmf.variance_divide) simp_all
  also have "... = (∑j<s. measure_pmf.variance p (λh. (∑x←xs. h j x)^2))/real s^2"
    by (intro arg_cong2[where f="(/)"] refl measure_pmf.bienaymes_identity_pairwise_indep_2
        prob_space.indep_vars_compose2[OF _ prob_space.k_wise_indep_vars_subset[OF _ indep]]
        prob_space_measure_pmf) (auto intro:finite_subset)
  also have "... = (∑j<s. measure_pmf.variance(map_pmf(λh. h j)p)(λh. (∑x←xs. h x)^2))/real s^2"
    by simp
  also have "... = (∑j<s. measure_pmf.variance ?p (λh. (∑x←xs. h x)^2))/ real s^2"
    by (intro sum.cong arg_cong2[where f="(/)"] refl) (simp add: comp)
  also have "... ≤ (∑j<s. 2 * F2 xs^2)/real s^2"
    by (intro divide_right_mono sum_mono f2_var_h[OF assms(1)]) simp
  also have "... = 2 * F2 xs^2/real s" by (simp add:power2_eq_square divide_simps)
  also have "... = 2 * F2 xs^2/ ⌈8/ε^2⌉"
    using less_imp_le[OF 0] unfolding s_def by (subst of_nat_nat) auto
  also have "... ≤ 2 * F2 xs^2 / (8/ ε^2)"
    using ε_gt_0 by (intro divide_left_mono mult_pos_pos) simp_all
  also have "... = ε^2 * F2 xs^2/4" by simp
  finally have var_R: "measure_pmf.variance p R ≤ ε^2 * F2 xs^2/4" by simp

  have "(∫h. R h ∂p) = (∑j<s. (∫h. (∑x←xs. h j x)^2 ∂p))/real s" unfolding R_def by simp
  also have "... = (∑j<s. (∫h. (∑x←xs. h x)^2 ∂(map_pmf(λh. h j)p)))/real s" by simp
  also have "... = (∑j<s. (∫h. (∑x←xs. h x)^2 ∂?p))/real s"
    by (intro sum.cong arg_cong2[where f="(/)"] refl) (simp add:comp)
  also have "... = F2 xs" using 1 unfolding f2_exp_h[OF assms(1)] by simp
  finally have exp_R: "(∫h. R h ∂p) = F2 xs" by simp

  have "?L ≤ measure p {h. ¦R h - F2 xs¦ ≥ ε * F2 xs}" by (intro pmf_mono) auto
  also have "... ≤ 𝒫(h in p. ¦R h - (∫h. R h ∂p)¦ ≥ ε * F2 xs)" unfolding exp_R by simp
  also have "... ≤ measure_pmf.variance p R / (ε * F2 xs)^2"
    using f2_gt_0 ε_gt_0 by (intro measure_pmf.Chebyshev_inequality) simp_all
  also have "... ≤ (ε^2 * F2 xs^2/4) /  (ε * F2 xs)^2"
    by (intro divide_right_mono var_R) simp
  also have "... = 1/4" using ε_gt_0 f2_gt_0 by (simp add:divide_simps)
  finally show ?thesis by simp
qed

lemma example_4_correct:
  assumes "set xs ⊆ {0..<n}"
  shows "𝒫(ω in example_4 n xs. ¦ω - F2 xs¦ > ε * F2 xs) ≤ 1/4" (is "?L ≤ ?R")
proof -
  define s :: nat where "s = nat ⌈8 / ε^2⌉"
  define R where "R h = (∑j<s. (∑x←xs. h j x)^2)/s" for h :: "nat ⇒ nat ⇒ real"

  let ?p = "sample_pro (ℋ 4 n (ℒ [-1,1::real]))"
  let ?q = "prod_pmf {..<s} (λ_. ?p)"

  have "?L = (∫h. indicator {h. ¦R h - F2 xs¦ > ε * F2 xs} h ∂ ?q)"
    by (simp add:Let_def measure_bind_pmf R_def s_def indicator_def atLeast0LessThan)
  also have "... = measure ?q {h. ¦R h - F2 xs¦ > ε * F2 xs}" by simp
  also have "... ≤ ?R" unfolding R_def s_def
    by (intro example_4_correct_aux[OF assms] prob_space.k_wise_indep_vars_triv
        prob_space_measure_pmf indep_vars_Pi_pmf)
     (auto intro: finite_pro_set simp add:Pi_pmf_component set_Pi_pmf)
  finally show ?thesis by simp
qed

text ‹Instead of independent samples, we can choose the seeds using a second
pair-wise independent pseudorandom object. This algorithm requires only
$\mathcal O(\ln n + \varepsilon^{-2} \ln m)$ bits of space.›

fun example_5 :: "nat ⇒ nat list ⇒ real pmf"
  where "example_5 n xs =
    do {
      let s = nat ⌈8 / ε^2⌉;
      h ← sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1])));
      return_pmf ((∑j < s. (∑x ← xs. h j x)^2)/s)
    }"

lemma example_5_correct_aux:
  assumes "set xs ⊆ {0..<n}"
  defines "s ≡ nat ⌈8 / ε^2⌉"
  defines "R ≡ (λh :: nat ⇒ nat ⇒ real. (∑j<s. (∑x←xs. h j x)^2)/real s)"
  shows "measure (sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1])))) {h. ¦R h - F2 xs¦ > ε * F2 xs} ≤ 1/4"
proof -
  let ?p = "sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1::real])))"

  have "prob_space.k_wise_indep_vars ?p 2 (λ_. discrete) (λi x. x i) {..<s}"
    using hash_pro_indep[OF prime_power_h2]
    by (simp add: prob_space.k_wise_indep_vars_def[OF prob_space_measure_pmf])

  thus ?thesis unfolding R_def s_def
    by (intro example_4_correct_aux[OF assms(1)] finite_pro_set)
      (simp_all add:hash_pro_component[OF prime_power_h2])
qed

lemma example_5_correct:
  assumes "set xs ⊆ {0..<n}"
  shows "𝒫(ω in example_5 n xs. ¦ω - F2 xs¦ > ε * F2 xs) ≤ 1/4" (is "?L ≤ ?R")
proof -
  define s :: nat where "s = nat ⌈8 / ε^2⌉"
  define R where "R h = (∑j<s. (∑x←xs. h j x)^2)/s" for h :: "nat ⇒ nat ⇒ real"

  let ?p = "sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1::real])))"

  have "?L = (∫h. indicator {h. ¦R h - F2 xs¦ > ε * F2 xs} h ∂ ?p)"
    by (simp add:Let_def measure_bind_pmf R_def s_def indicator_def)
  also have "... = measure ?p {h. ¦R h - F2 xs¦ > ε * F2 xs}" by simp
  also have "... ≤ ?R" unfolding R_def s_def by (intro example_5_correct_aux[OF assms])
  finally show ?thesis by simp
qed

text ‹The following algorithm improves on the previous one, by achieving a success probability
of $\delta$. This works by taking the median of $\mathcal O(\ln(\delta^{-1}))$ parallel independent
samples. It requires $\mathcal O(\ln(\delta^{-1}) (\ln n + \varepsilon^{-2} \ln m))$ bits of space.›

fun example_6 :: "nat ⇒ nat list ⇒ real pmf"
  where "example_6 n xs =
    do {
      let s = nat ⌈8 / ε^2⌉; let t = nat ⌈8 * ln (1/δ)⌉;
      h ← prod_pmf {0..<t} (λ_. sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1]))));
      return_pmf (median t (λi. ((∑j < s. (∑x ← xs. h i j x)^2)/ s)))
    }"

lemma example_6_correct:
  assumes "set xs ⊆ {0..<n}"
  shows "𝒫(ω in example_6 n xs. ¦ω - F2 xs¦ > ε * F2 xs) ≤ δ" (is "?L ≤ ?R")
proof -
  define s where "s = nat ⌈8 / ε^2⌉"
  define t where "t = nat ⌈8 * ln(1/δ)⌉"
  define R where "R h = (∑j<s. (∑x←xs. h j x)^2)/s" for h :: "nat ⇒ nat ⇒ real"
  define I where "I = {w. ¦w - F2 xs¦ ≤ ε*F2 xs}"

  have "8 * ln (1 / δ) > 0" using δ_range by (intro mult_pos_pos ln_gt_zero) auto
  hence t_gt_0: "t > 0" unfolding t_def by simp
  have int_I: "interval I" unfolding interval_def I_def by auto

  let ?p = "sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1::real])))"
  let ?q = "prod_pmf {0..<t} (λ_. ?p)"

  have "(∫h. (of_bool (R h ∉ I)::real) ∂?p) = (∫h. indicator {h. R h ∉ I} h ∂?p)"
    unfolding of_bool_def indicator_def by simp
  also have "... = measure ?p {h. R h ∉ I}" by simp
  also have "... ≤ 1/4"
    using example_5_correct_aux[OF assms] unfolding R_def s_def I_def by (simp add:not_le)
  finally have 0: "(∫h. (of_bool (R h ∉ I)::real) ∂?p) ≤ 1/4" by simp

  have "?L =  (∫h. indicator {h. ¦median t (λi. R (h i)) - F2 xs¦ > ε * F2 xs} h ∂ ?q)"
    by (simp add:Let_def measure_bind_pmf R_def s_def indicator_def t_def)
  also have "... = measure ?q {h. median t (λi. R (h i)) ∉ I}"
    unfolding I_def by (simp add:not_le)
  also have "... ≤ measure ?q {h. t ≤ 2 * card {k. k < t ∧ R (h k) ∉ I}}"
    using median_est_rev[OF int_I] by (intro pmf_mono) auto
  also have "... = measure ?q {h. (∑k<t. of_bool(R (h k) ∉ I))/real t - 1/4 ≥ (1/4)}"
    using t_gt_0 by (intro arg_cong2[where f="measure"]) (auto simp:Int_def divide_simps)
  also have "... ≤ exp ( - 2 * real t * (1/4)^2)"
    by (intro classic_chernoff_bound_one_sided t_gt_0 AE_pmfI 0) auto
  also have "... = exp (- (real t / 8))" using t_gt_0 by (simp add:power2_eq_square)
  also have "... ≤ exp (- of_int ⌈8 * ln (1 / δ)⌉ / 8)" unfolding t_def
    by (intro iffD2[OF exp_le_cancel_iff] divide_right_mono iffD2[OF neg_le_iff_le]) auto
  also have "... ≤ exp (- (8 * ln (1 / δ)) / 8)"
    by (intro iffD2[OF exp_le_cancel_iff] divide_right_mono iffD2[OF neg_le_iff_le]) auto
  also have "... = exp (- ln (1 / δ))" by simp
  also have "... = δ" using δ_range by (subst ln_div) auto
  finally show ?thesis by simp
qed

text ‹The following algorithm uses an expander random walk, instead of independent samples.
It requires only $\mathcal O(\ln n + \ln(\delta^{-1}) \varepsilon^{-2} \ln m)$ bits of space.›

fun example_7 :: "nat ⇒ nat list ⇒ real pmf"
  where "example_7 n xs =
    do {
      let s = nat ⌈8 / ε^2⌉; let t = nat ⌈32 * ln (1/δ)⌉;
      h ← sample_pro (ℰ t (1/8) (ℋ 2 s (ℋ 4 n (ℒ [-1,1]))));
      return_pmf (median t (λi. ((∑j < s. (∑x ← xs. h i j x)^2)/ s)))
    }"

lemma example_7_correct:
  assumes "set xs ⊆ {0..<n}"
  shows "𝒫(ω in example_7 n xs. ¦ω - F2 xs¦ > ε * F2 xs) ≤ δ" (is "?L ≤ ?R")
proof -
  define s t where s_def: "s = nat ⌈8 / ε^2⌉" and t_def: "t = nat ⌈32 * ln(1/δ)⌉"
  define R where "R h = (∑j<s. (∑x←xs. h j x)^2)/s" for h :: "nat ⇒ nat ⇒ real"
  define I where "I = {w. ¦w - F2 xs¦ ≤ ε*F2 xs}"

  have "8 * ln (1 / δ) > 0" using δ_range by (intro mult_pos_pos ln_gt_zero) auto
  hence t_gt_0: "t > 0" unfolding t_def by simp
  have int_I: "interval I" unfolding interval_def I_def by auto

  let ?p = "sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1::real])))"
  let ?q = "sample_pro (ℰ t (1/8) (ℋ 2 s (ℋ 4 n (ℒ [-1,1]))))"

  have "(∫h. (of_bool (R h ∉ I)::real) ∂?p) = (∫h. indicator {h. R h ∉ I} h ∂?p)"
    by (simp add:of_bool_def indicator_def)
  also have "... = measure ?p {h. R h ∉ I}" by simp
  also have "... ≤ 1/4"
    using example_5_correct_aux[OF assms] unfolding R_def s_def I_def by (simp add:not_le)
  finally have *: "(∫h. (of_bool (R h ∉ I)::real) ∂?p) ≤ 1/4" by simp

  have "?L =  (∫h. indicator {h. ¦median t (λi. R (h i)) - F2 xs¦ > ε * F2 xs} h ∂ ?q)"
    by (simp add:Let_def measure_bind_pmf R_def s_def indicator_def t_def)
  also have "... = measure ?q {h. median t (λi. R (h i)) ∉ I}"
    unfolding I_def by (simp add:not_le)
  also have "... ≤ measure ?q {h. t ≤ 2 * card {k. k < t ∧ R (h k) ∉ I}}"
    using median_est_rev[OF int_I] by (intro pmf_mono) auto
  also have "... = measure ?q {h. 1/8 + 1/8 ≤ (∑k<t. of_bool(R (h k) ∉ I))/real t - 1/4}"
    using t_gt_0 by (intro arg_cong2[where f="measure"] Collect_cong refl)
     (auto simp add:of_bool_def sum.If_cases Int_def field_simps)
  also have "... ≤ exp (- 2 * real t * (1/8)⇧²)"
    by (intro expander_chernoff_bound_one_sided t_gt_0 *) auto
  also have "... = exp (- (real t / 32))" using t_gt_0 by (simp add:power2_eq_square)
  also have "... ≤ exp (- of_int ⌈32 * ln (1 / δ)⌉ / 32)" unfolding t_def
    by (intro iffD2[OF exp_le_cancel_iff] divide_right_mono iffD2[OF neg_le_iff_le]) auto
  also have "... ≤ exp (- (32 * ln (1 / δ)) / 32)"
    by (intro iffD2[OF exp_le_cancel_iff] divide_right_mono iffD2[OF neg_le_iff_le]) auto
  also have "... = exp (- ln (1 / δ))" by simp
  also have "... = δ" using δ_range by (subst ln_div) auto
  finally show ?thesis by simp
qed

end

end