Theory Tutorial_Pseudorandom_Objects
section ‹Tutorial on the use of Pseudorandom-Objects›
theory Tutorial_Pseudorandom_Objects
imports
Universal_Hash_Families.Pseudorandom_Objects_Hash_Families
Expander_Graphs.Pseudorandom_Objects_Expander_Walks
Equivalence_Relation_Enumeration.Equivalence_Relation_Enumeration
Median_Method.Median
Concentration_Inequalities.Bienaymes_Identity
Frequency_Moments.Frequency_Moments
begin
text ‹This section is a tutorial for the use of pseudorandom objects. Starting from the
approximation algorithm for the second frequency moment by Alon et al.~\cite{alon1999}, we will
improve the solution until we achieve a space complexity of
$\mathcal O(\ln n + \varepsilon^{-2} \ln(\delta^{-1}) \ln m)$, where $n$ denotes the range of the
stream elements, $m$ denotes the length of the stream, $\varepsilon$ denotes the desired accuracy
and $\delta$ denotes the desired failure probability.
The construction relies on a combination of pseudorandom object, in particular an expander walk
and two chained hash families.›
hide_const (open) topological_space_class.discrete
hide_const (open) Abstract_Rewriting.restrict
hide_fact (open) Abstract_Rewriting.restrict_def
hide_fact (open) Henstock_Kurzweil_Integration.integral_cong
hide_fact (open) Henstock_Kurzweil_Integration.integral_mult_right
hide_fact (open) Henstock_Kurzweil_Integration.integral_diff
text ‹The following lemmas show a one-side and two-sided Chernoff-bound for $\{0,1\}$-valued
independent identically distributed random variables. This to show the similarity with expander
walks, for which similar bounds can be established:
@{thm [source] expander_chernoff_bound_one_sided} and @{thm [source] expander_chernoff_bound}.›
lemma classic_chernoff_bound_one_sided:
fixes l :: nat
assumes "AE x in measure_pmf p. f x ∈ {0,1::real}"
assumes "(∫x. f x ∂p) ≤ μ" "l > 0" "γ ≥ 0"
shows "measure (prod_pmf {0..<l} (λ_. p)) {w. (∑i<l. f (w i))/l-μ≥γ} ≤ exp (- 2 * real l * γ^2)"
(is "?L ≤ ?R")
proof -
define ν where "ν = real l*(∫x. f x ∂p)"
let ?p = "prod_pmf {0..<l} (λ_. p)"
have 1: "prob_space.indep_vars (measure_pmf ?p) (λ_. borel) (λi x. f (x i)) {0..<l}"
by (intro prob_space.indep_vars_compose2[OF _ indep_vars_Pi_pmf] prob_space_measure_pmf) auto
have "f (y i) ∈ {0..1}" if "y ∈ {0..<l} →⇩E set_pmf p" "i ∈ {0..<l}" for y i
proof -
have "y i ∈ set_pmf p" using that by auto
thus ?thesis using assms(1) unfolding AE_measure_pmf_iff by auto
qed
hence 2: "AE x in measure_pmf ?p. f (x i) ∈ {0..1}"
if "i ∈ {0..<l}" for i
using that by (intro AE_pmfI) (auto simp: set_prod_pmf)
have "(∑i=0..<l. (∫x. f (x i) ∂?p)) = (∑i<l. (∫x. f x ∂map_pmf (λx. x i) ?p))"
by (auto simp:atLeast0LessThan)
also have "... = (∑i<l. (∫x. f x ∂p))" by (subst Pi_pmf_component) auto
also have "... = ν" unfolding ν_def by simp
finally have 3: "(∑i=0..<l. (∫x. f (x i) ∂prod_pmf {0..<l} (λ_. p))) = ν" by simp
have 4: "ν ≤ real l * μ" unfolding ν_def using assms(2) by (simp add: mult_le_cancel_left)
interpret Hoeffding_ineq "measure_pmf ?p" "{0..<l}" "λi x. f (x i)" "(λ_. 0)" "(λ_. 1)" ν
using 1 2 unfolding 3 by unfold_locales auto
have "?L ≤ measure ?p {x. (∑i=0..<l. f (x i)) ≥ real l*μ + real l*γ}"
using assms(3) by (intro pmf_mono) (auto simp:field_simps atLeast0LessThan)
also have "... ≤ measure ?p {x ∈ space ?p. (∑i=0..<l. f (x i)) ≥ ν + real l*γ}"
using 4 by (intro pmf_mono) auto
also have "... ≤ exp (- 2 * (real l * γ)^2 / (∑i=0..<l. (1 - 0)⇧2))"
using assms(3,4) by (intro Hoeffding_ineq_ge) auto
also have "... = ?R" using assms(3) by (simp add:power2_eq_square)
finally show ?thesis by simp
qed
lemma classic_chernoff_bound:
assumes "AE x in measure_pmf p. f x ∈ {0,1::real}" "l > (0::nat)" "γ ≥ 0"
defines "μ ≡ (∫x. f x ∂p)"
shows "measure (prod_pmf {0..<l} (λ_. p)) {w. ¦(∑i<l. f (w i))/l-μ¦≥γ} ≤ 2*exp (-2*real l*γ^2)"
(is "?L ≤ ?R")
proof -
have [simp]:"integrable p f" using assms(1) unfolding AE_measure_pmf_iff
by (intro integrable_bounded_pmf boundedI[where B="1"]) auto
let ?w = "prod_pmf {0..<l} (λ_. p)"
have "?L ≤ measure ?w {w. (∑i<l. f (w i))/l-μ≥γ} + measure ?w {w. (∑i<l. f (w i))/l-μ≤-(γ)}"
by (intro pmf_add) auto
also have "... ≤ exp (-2*real l*γ^2) + measure ?w {w. -((∑i<l. f (w i))/l-μ)≥γ}"
using assms by (intro add_mono classic_chernoff_bound_one_sided) (auto simp:algebra_simps)
also have "... ≤ exp (-2*real l*γ^2) + measure ?w {w. ((∑i<l. 1-f (w i))/l-(1-μ))≥γ}"
using assms(2) by (auto simp: sum_subtractf field_simps)
also have "... ≤ exp (-2*real l*γ^2) + exp (-2*real l*γ^2)"
using assms by (intro add_mono classic_chernoff_bound_one_sided) auto
also have "... = ?R" by simp
finally show ?thesis by simp
qed
text ‹Definition of the second frequency moment of a stream.›
definition F2 :: "'a list ⇒ real" where
"F2 xs = (∑ x ∈ set xs. (of_nat (count_list xs x)^2))"
lemma prime_power_ls: "is_prime_power (pro_size (ℒ [- 1, 1]))"
proof -
have "is_prime_power ((2::nat)^1)" by (intro is_prime_powerI) auto
thus "is_prime_power (pro_size (ℒ [- 1, 1]))" by (auto simp:list_pro_size numeral_eq_Suc)
qed
lemma prime_power_h2: "is_prime_power (pro_size (ℋ 4 n (ℒ [- 1, 1::real])))"
by (intro hash_pro_size_prime_power prime_power_ls) auto
abbreviation Ψ where "Ψ ≡ pmf_of_set {-1,1::real}"
lemma f2_exp:
assumes "finite (set_pmf p)"
assumes "⋀I. I ⊆ {0..<n} ⟹ card I ≤ 4 ⟹ map_pmf (λx. (λi∈I. x i)) p = prod_pmf I (λ_. Ψ)"
assumes "set xs ⊆ {0..<n::nat}"
shows "(∫h. (∑x ← xs. h x)^2 ∂p) = F2 xs" (is "?L = ?R")
proof -
let ?c = "(λx. real (count_list xs x))"
have [simp]: "integrable (measure_pmf p) f" for f :: "_ ⇒ real"
by (intro integrable_measure_pmf_finite assms)
have 0:"(∫h. h x * h y ∂p) = of_bool (x = y)"
(is "?L1 = ?R1") if "x ∈ set xs" "y ∈ set xs" for x y
proof -
have xy_lt_n: "x < n" "y < n" using assms that by auto
have card_xy: "card {x,y} ≤ 4" by (cases "x = y") auto
have "?L1 = (∫h. (h x * h y) ∂map_pmf (λf. restrict f {x,y}) p)"
by simp
also have "... = (∫h. (h x * h y) ∂prod_pmf {x,y} (λ_. Ψ))"
using xy_lt_n card_xy by (intro integral_cong assms(2) arg_cong[where f="measure_pmf"]) auto
also have "... = of_bool (x = y)" (is "?L2 = ?R2")
proof (cases "x = y")
case True
hence "?L2 = (∫h. (h x ^2) ∂prod_pmf {x} (λ_. pmf_of_set {-1,1}))"
unfolding power2_eq_square by simp
also have "... = (∫x. x^2 ∂pmf_of_set {-1,1})"
unfolding Pi_pmf_singleton by simp
also have "... = 1" by (subst integral_pmf_of_set) auto
also have "... = ?R2" using True by simp
finally show ?thesis by simp
next
case False
hence "?L2 = (∫h. (∏i∈{x,y}. h i) ∂prod_pmf {x,y} (λ_. pmf_of_set {-1,1}))" by simp
also have "... = (∏i∈{x,y}. (∫x. x ∂pmf_of_set {-1,1}))"
by (intro expectation_prod_Pi_pmf integrable_measure_pmf_finite) auto
also have "... = 0" using False by (subst integral_pmf_of_set) auto
also have "... = ?R2" using False by simp
finally show ?thesis by simp
qed
finally show ?thesis by simp
qed
have "?L = (∫h. (∑x ∈ set xs. real (count_list xs x) * h x)^2 ∂p)"
unfolding sum_list_eval by simp
also have "... = (∫h. (∑x ∈ set xs. (∑y ∈ set xs. (?c x * ?c y) * h x * h y)) ∂p)"
unfolding power2_eq_square sum_distrib_left sum_distrib_right by (simp add:ac_simps)
also have "... = (∑x ∈ set xs. (∑y ∈ set xs. (∫h. (?c x * ?c y) * h x * h y ∂p)))" by simp
also have "... = (∑x ∈ set xs. (∑y ∈ set xs. ?c x * ?c y * (∫h. h x * h y ∂p)))"
by (subst integral_mult_right[symmetric]) (simp_all add:ac_simps)
also have "... = (∑x ∈ set xs. (∑y ∈ set xs. ?c x * ?c y * of_bool (x = y)))"
by (intro sum.cong refl) (simp add: 0)
also have "... = (∑x ∈ set xs. ?c x^2) "
unfolding of_bool_def by (simp add:if_distrib if_distribR sum.If_cases power2_eq_square)
also have "... = F2 xs" unfolding F2_def by simp
finally show ?thesis by simp
qed
lemma f2_exp_sq:
assumes "finite (set_pmf p)"
assumes "⋀I. I ⊆ {0..<n} ⟹ card I ≤ 4 ⟹ map_pmf (λx. (λi∈I. x i)) p = prod_pmf I (λ_. Ψ)"
assumes "set xs ⊆ {0..<n::nat}"
shows "(∫h. ((∑x←xs. h x)^2)^2 ∂p) ≤ 3 * F2 xs^2" (is "?L ≤ ?R")
proof -
let ?c = "(λx. real (count_list xs x))"
have [simp]: "integrable (measure_pmf p) f" for f :: "_ ⇒ real"
by (intro integrable_measure_pmf_finite assms)
define S where "S = set xs"
have a: "finite S" unfolding S_def by simp
define Q :: "nat ⇒ nat ⇒ nat ⇒ nat ⇒ real"
where "Q a b c d =
of_bool(a=b∧c=d∧a≠c) + of_bool(a=c∧b=d∧a≠b) +
of_bool(a=d∧b=c∧a≠b) + of_bool(a=b∧b=c∧c=d)" for a b c d
have cases: "(∫h. h a*h b*h c*h d ∂p) = Q a b c d" (is "?L1 = ?R1")
if "a ∈ S" "b∈S" "c ∈ S" "d ∈ S" for a b c d
proof -
have "card {a,b,c,d} = card (set [a,b,c,d])" by (intro arg_cong[where f="card"]) auto
also have "... ≤ length [a,b,c,d]" by (intro card_length)
finally have card: "card {a, b, c, d} ≤ 4" by simp
have "?L1 = (∫h. h a*h b*h c*h d ∂map_pmf (λf. restrict f {a,b,c,d}) p)" by simp
also have "... = (∫h. h a*h b*h c*h d ∂prod_pmf {a,b,c,d} (λ_. Ψ))" using that assms(3)
by (intro integral_cong arg_cong[where f="measure_pmf"] assms(2) card) (auto simp:S_def)
also have "... = (∫h. (∏i ←[a,b,c,d]. h i) ∂prod_pmf {a,b,c,d} (λ_. Ψ))" by (simp add:ac_simps)
also have "... = (∫h. (∏i ∈{a,b,c,d}. h i^count_list [a,b,c,d] i) ∂prod_pmf {a,b,c,d} (λ_. Ψ))"
by (subst prod_list_eval) auto
also have "... = (∏i ∈ {a,b,c,d}. (∫x. x^count_list [a,b,c,d] i ∂Ψ))"
by (intro expectation_prod_Pi_pmf integrable_measure_pmf_finite) auto
also have "... = (∏i ∈ {a,b,c,d}. of_bool (even (count_list [a,b,c,d] i)))"
by (intro prod.cong refl) (auto simp:integral_pmf_of_set)
also have "... = (∏i ∈ set (remdups [a,b,c,d]). of_bool (even (count_list [a,b,c,d] i)))"
by (intro prod.cong refl) auto
also have "... = (∏i ← remdups [a,b,c,d]. of_bool (even (count_list [a,b,c,d] i)))"
by (intro prod.distinct_set_conv_list) auto
also have "... = Q a b c d" unfolding Q_def by simp
finally show ?thesis by simp
qed
have "?L = (∫h. (∑x ∈ S. real (count_list xs x) * h x)^4 ∂p)"
unfolding S_def sum_list_eval by simp
also have "... = (∫h. (∑a∈S. (∑b∈S.(∑c∈S.(∑d∈S.(?c a*?c b*?c c*?c d)*h a*h b*h c*h d)))) ∂p)"
unfolding power4_eq_xxxx sum_distrib_left sum_distrib_right by (simp add:ac_simps)
also have "... = (∑a∈S.(∑b∈S.(∑c∈S.(∑d∈S.(∫h. (?c a*?c b*?c c*?c d)*h a*h b*h c*h d ∂p)))))"
by simp
also have "... = (∑a∈S.(∑b∈S.(∑c∈S.(∑d∈S. (?c a*?c b*?c c*?c d)*(∫h. h a*h b*h c*h d ∂p)))))"
by (subst integral_mult_right[symmetric]) (simp_all add:ac_simps)
also have "... = (∑a∈S.(∑b∈S.(∑c∈S.(∑d∈S. (?c a*?c b*?c c*?c d)*(Q a b c d)))))"
by (intro sum.cong refl) (simp add:cases)
also have "... = 1*(∑a∈S. ?c a^4) + 3*(∑a∈S. (∑b∈S. ?c a^2 * ?c b^2 * of_bool(a≠b)))"
unfolding Q_def
by (simp add: sum.distrib distrib_left sum_collapse[OF a] ac_simps sum_distrib_left[symmetric]
power2_eq_square power4_eq_xxxx)
also have "... ≤ 3*(∑a∈S. ?c a^4) + 3*(∑a∈S. (∑b∈S. ?c a^2 * ?c b^2 * of_bool(a≠b)))"
by (intro add_mono mult_right_mono sum_nonneg) auto
also have "... = 3*(∑a∈S. (∑b∈S. ?c a^2 * ?c b^2 * (of_bool (a=b) + of_bool(a≠b))))"
using a by (simp add: sum.distrib distrib_left)
also have "... = 3*(∑a∈S. (∑b∈S. ?c a^2 * ?c b^2 * 1))"
by (intro sum.cong arg_cong2[where f="(*)"] refl) auto
also have "... = 3 * F2 xs^2" unfolding F2_def power2_eq_square
by (simp add: S_def sum_distrib_left sum_distrib_right ac_simps)
finally show "?L ≤ 3 * F2 xs^2" by simp
qed
lemma f2_var:
assumes "finite (set_pmf p)"
assumes "⋀I. I ⊆ {0..<n} ⟹ card I ≤ 4 ⟹ map_pmf (λx. (λi∈I. x i)) p = prod_pmf I (λ_. Ψ)"
assumes "set xs ⊆ {0..<n::nat}"
shows "measure_pmf.variance p (λh. (∑x←xs. h x)^2) ≤ 2* F2 xs^2"
(is "?L ≤ ?R")
proof -
have [simp]: "integrable (measure_pmf p) f" for f :: "_ ⇒ real"
by (intro integrable_measure_pmf_finite assms)
have "?L = (∫h. ((∑x←xs. h x)^2)^2 ∂p) - F2 xs^2"
by (subst measure_pmf.variance_eq) (simp_all add:f2_exp[OF assms(1-3)])
also have "... ≤ 3 * F2 xs^2 - F2 xs^2"
by (intro diff_mono f2_exp_sq[OF assms]) auto
finally show ?thesis by simp
qed
lemma
assumes "s ∈ set_pmf (ℋ⇩P 4 n (ℒ [-1,1]))"
assumes "set xs ⊆ {0..<n}"
shows f2_exp_hp: "(∫h. (∑x ← xs. h x)^2 ∂sample_pro s) = F2 xs" (is "?T1")
and f2_exp_sq_hp: "(∫h. ((∑x ← xs. h x)^2)^2 ∂sample_pro s) ≤ 3* F2 xs^2" (is "?T2")
and f2_var_hp: "measure_pmf.variance s (λh. (∑x ← xs. h x)^2) ≤ 2* F2 xs^2" (is "?T3")
proof -
have 0:"map_pmf (λx. restrict x I) (sample_pro s) = prod_pmf I (λ_. Ψ)" (is "?L = _")
if "I ⊆ {0..<n}" "card I ≤ 4" for I
proof -
have "?L = prod_pmf I (λ_. sample_pro (ℒ [- 1, 1]))"
using that by (intro hash_pro_pmf_distr[OF _ assms(1)] prime_power_ls) auto
also have "... = prod_pmf I (λ_. Ψ)" by (subst list_pro_2) auto
finally show ?thesis by simp
qed
show ?T1 by (intro f2_exp[OF _ _ assms(2)] finite_pro_set 0) simp
show ?T2 by (intro f2_exp_sq[OF _ _ assms(2)] finite_pro_set 0) simp
show ?T3 by (intro f2_var[OF _ _ assms(2)] finite_pro_set 0) simp
qed
lemmas f2_exp_h = f2_exp_hp[OF hash_pro_in_hash_pro_pmf[OF prime_power_ls]]
lemmas f2_var_h = f2_var_hp[OF hash_pro_in_hash_pro_pmf[OF prime_power_ls]]
lemma F2_definite:
assumes "xs ≠ []"
shows "F2 xs > 0"
proof -
have "0 < real (card (set xs))" using assms by (simp add: card_gt_0_iff)
also have "... = (∑x ∈ set xs. 1)" by simp
also have "... ≤ F2 xs" using count_list_gr_1 unfolding F2_def by (intro sum_mono) force
finally show ?thesis by simp
qed
text ‹The following algorithm uses a completely random function, accordingly it requires
a lot of space: $\mathcal O(n + \ln m)$.›
fun example_1 :: "nat ⇒ nat list ⇒ real pmf"
where "example_1 n xs =
do {
h ← prod_pmf {0..<n} (λ_. pmf_of_set {-1,1::real});
return_pmf ((∑x ← xs. h x)^2)
}"
lemma example_1_correct:
assumes "set xs ⊆ {0..<n}"
shows
"measure_pmf.expectation (example_1 n xs) id = F2 xs" (is "?L1 = ?R1")
"measure_pmf.variance (example_1 n xs) id ≤ 2 * F2 xs^2" (is "?L2 ≤ ?R2")
proof -
have "?L1 = (∫h. (∑x ← xs. h x)^2 ∂prod_pmf {0..<n} (λ_. Ψ))"
by (simp add:map_pmf_def[symmetric])
also have "... = ?R1" using assms by (intro f2_exp)
(auto intro: Pi_pmf_subset[symmetric] simp add:restrict_def set_Pi_pmf)
finally show "?L1 = ?R1" by simp
have "?L2 = measure_pmf.variance (prod_pmf {0..<n} (λ_. Ψ)) (λh. (∑x ← xs. h x)^2)"
by (simp add:map_pmf_def[symmetric] atLeast0LessThan)
also have "... ≤ ?R2"
using assms by (intro f2_var)
(auto intro: Pi_pmf_subset[symmetric] simp add:restrict_def set_Pi_pmf)
finally show "?L2 ≤ ?R2" by simp
qed
text ‹This version replaces a the use of completely random function with a pseudorandom object,
it requires a lot less space: $\mathcal O(\ln n + \ln m)$.›
fun example_2 :: "nat ⇒ nat list ⇒ real pmf"
where "example_2 n xs =
do {
h ← sample_pro (ℋ 4 n (ℒ [-1,1]));
return_pmf ((∑x ← xs. h x)^2)
}"
lemma example_2_correct:
assumes "set xs ⊆ {0..<n}"
shows
"measure_pmf.expectation (example_2 n xs) id = F2 xs" (is "?L1 = ?R1")
"measure_pmf.variance (example_2 n xs) id ≤ 2 * F2 xs^2" (is "?L2 ≤ ?R2")
proof -
have "?L1 = (∫h. (∑x ← xs. h x)^2 ∂sample_pro (ℋ 4 n (ℒ [-1,1])))"
by (simp add:map_pmf_def[symmetric])
also have "... = ?R1"
using assms by (intro f2_exp_h) auto
finally show "?L1 = ?R1" by simp
have "?L2 = measure_pmf.variance (sample_pro (ℋ 4 n (ℒ [-1,1]))) (λh. (∑x ← xs. h x)^2)"
by (simp add:map_pmf_def[symmetric])
also have "... ≤ ?R2"
using assms by (intro f2_var_h) auto
finally show "?L2 ≤ ?R2" by simp
qed
text ‹The following version replaces the deterministic construction of the pseudorandom object
with a randomized one. This algorithm is much faster, but the correctness proof is more difficult.›
fun example_3 :: "nat ⇒ nat list ⇒ real pmf"
where "example_3 n xs =
do {
h ← sample_pro =<< ℋ⇩P 4 n (ℒ [-1,1]);
return_pmf ((∑x ← xs. h x)^2)
}"
lemma
assumes "set xs ⊆ {0..<n}"
shows
"measure_pmf.expectation (example_3 n xs) id = F2 xs" (is "?L1 = ?R1")
"measure_pmf.variance (example_3 n xs) id ≤ 2 * F2 xs^2" (is "?L2 ≤ ?R2")
proof -
let ?p = "ℋ⇩P 4 n (ℒ [-1,1::real])"
let ?q = "bind_pmf ?p sample_pro"
have "¦h x¦ ≤ 1" if that1: "M ∈ set_pmf ?p" "h ∈ pro_set M" "x ∈ set xs" for h M x
proof -
obtain i where 1:"h = pro_select M i"
using that1(2) unfolding set_sample_pro[of "M"] by auto
have "h x ∈ pro_set (ℒ [-1,1::real])"
unfolding 1 using that(1) by (intro hash_pro_pmf_range[OF prime_power_ls]) auto
thus ?thesis by (auto simp: list_pro_set)
qed
hence 0: "bounded ((λxa. xa x) ` set_pmf ?q)" if "x ∈ set xs" for x
using that by (intro boundedI[where B="1"]) auto
have "(∫h. (∑x ← xs. h x)^2 ∂?q) = (∫s. (∫h. (∑x ← xs. h x)^2 ∂sample_pro s) ∂?p)"
by (intro integral_bind_pmf bounded_pow bounded_sum_list 0)
also have "... = (∫s. F2 xs ∂?p)"
by (intro integral_cong_AE AE_pmfI f2_exp_hp[OF _ assms]) simp_all
also have "... = ?R1" by simp
finally have a:"(∫h. (∑x ← xs. h x)^2 ∂?q) = ?R1" by simp
thus "?L1 = ?R1" by (simp add:map_pmf_def[symmetric])
have "?L2 = measure_pmf.variance ?q (λh. (∑x ← xs. h x)^2)"
by (simp add:map_pmf_def[symmetric])
also have "... = (∫h. ((∑x ← xs. h x)^2)^2 ∂?q) - (∫h. (∑x ← xs. h x)^2 ∂?q)^2"
by (intro measure_pmf.variance_eq integrable_bounded_pmf bounded_pow bounded_sum_list 0)
also have "... = (∫s. (∫h. ((∑x ← xs. h x)^2)^2 ∂sample_pro s) ∂?p)- (F2 xs)⇧2"
unfolding a
by (intro arg_cong2[where f="(-)"] integral_bind_pmf refl bounded_pow bounded_sum_list 0)
also have "... ≤ (∫s. 3*F2 xs^2 ∂?p) - (F2 xs)^2"
by (intro diff_mono integral_mono_AE' AE_pmfI f2_exp_sq_hp[OF _ assms]) simp_all
also have "... = ?R2" by simp
finally show "?L2 ≤ ?R2" by simp
qed
context
fixes ε δ :: real
assumes ε_gt_0: "ε > 0"
assumes δ_range: "δ ∈ {0<..<1}"
begin
text ‹By using the mean of many independent parallel estimates the following algorithm
achieves a relative accuracy of $\varepsilon$, with probability $\frac{3}{4}$.
It requires $\mathcal O(\varepsilon^{-2} (\ln n + \ln m))$ bits of space.›
fun example_4 :: "nat ⇒ nat list ⇒ real pmf"
where "example_4 n xs =
do {
let s = nat ⌈8 / ε^2⌉;
h ← prod_pmf {0..<s} (λ_. sample_pro (ℋ 4 n (ℒ [-1,1])));
return_pmf ((∑j < s. (∑x ← xs. h j x)^2)/s)
}"
lemma example_4_correct_aux:
assumes "set xs ⊆ {0..<n}"
defines "s ≡ nat ⌈8 / ε^2⌉"
defines "R ≡ (λh :: nat ⇒ nat ⇒ real. (∑j<s. (∑x←xs. h j x)^2)/real s)"
assumes fin: "finite (set_pmf p)"
assumes indep: "prob_space.k_wise_indep_vars (measure_pmf p) 2 (λ_. discrete) (λi x. x i) {..<s}"
assumes comp: "⋀i. i < s ⟹ map_pmf (λx. x i) p = sample_pro (ℋ 4 n (ℒ [-1,1]))"
shows "measure p {h. ¦R h - F2 xs¦ > ε * F2 xs} ≤ 1/4" (is "?L ≤ ?R")
proof (cases "xs = []")
case True thus ?thesis by (simp add:R_def F2_def)
next
case False
note f2_gt_0 = F2_definite[OF False]
let ?p = "sample_pro (ℋ 4 n (ℒ [-1,1::real]))"
have [simp]: "integrable (measure_pmf p) f" for f :: "_ ⇒ real"
by (intro integrable_measure_pmf_finite fin)
have "8 / ε⇧2 > 0" using ε_gt_0 by (intro divide_pos_pos) auto
hence 0:"⌈8 / ε⇧2⌉ > 0" by simp
hence 1: "s > 0" unfolding s_def by simp
have "(∫h. R h ∂p) = (∑j<s. (∫h. (∑x←xs. h j x)^2 ∂p))/real s" unfolding R_def by simp
also have "... = (∑j<s. (∫h. (∑x←xs. h x)^2 ∂(map_pmf(λh. h j)p)))/real s" by simp
also have "... = (∑j<s. (∫h. (∑x←xs. h x)^2 ∂?p))/real s"
by (intro sum.cong arg_cong2[where f="(/)"] refl) (simp add: comp)
also have "... = F2 xs" using 1 unfolding f2_exp_h[OF assms(1)] by simp
finally have exp_R: "(∫h. R h ∂p) = F2 xs" by simp
have "measure_pmf.variance p R = measure_pmf.variance p (λh. (∑j<s. (∑x←xs. h j x)^2))/s^2"
unfolding R_def by (subst measure_pmf.variance_divide) simp_all
also have "... = (∑j<s. measure_pmf.variance p (λh. (∑x←xs. h j x)^2))/real s^2"
by (intro arg_cong2[where f="(/)"] refl measure_pmf.bienaymes_identity_pairwise_indep_2
prob_space.indep_vars_compose2[OF _ prob_space.k_wise_indep_vars_subset[OF _ indep]]
prob_space_measure_pmf) (auto intro:finite_subset)
also have "... = (∑j<s. measure_pmf.variance(map_pmf(λh. h j)p)(λh. (∑x←xs. h x)^2))/real s^2"
by simp
also have "... = (∑j<s. measure_pmf.variance ?p (λh. (∑x←xs. h x)^2))/ real s^2"
by (intro sum.cong arg_cong2[where f="(/)"] refl) (simp add: comp)
also have "... ≤ (∑j<s. 2 * F2 xs^2)/real s^2"
by (intro divide_right_mono sum_mono f2_var_h[OF assms(1)]) simp
also have "... = 2 * F2 xs^2/real s" by (simp add:power2_eq_square divide_simps)
also have "... = 2 * F2 xs^2/ ⌈8/ε^2⌉"
using less_imp_le[OF 0] unfolding s_def by (subst of_nat_nat) auto
also have "... ≤ 2 * F2 xs^2 / (8/ ε^2)"
using ε_gt_0 by (intro divide_left_mono mult_pos_pos) simp_all
also have "... = ε^2 * F2 xs^2/4" by simp
finally have var_R: "measure_pmf.variance p R ≤ ε^2 * F2 xs^2/4" by simp
have "(∫h. R h ∂p) = (∑j<s. (∫h. (∑x←xs. h j x)^2 ∂p))/real s" unfolding R_def by simp
also have "... = (∑j<s. (∫h. (∑x←xs. h x)^2 ∂(map_pmf(λh. h j)p)))/real s" by simp
also have "... = (∑j<s. (∫h. (∑x←xs. h x)^2 ∂?p))/real s"
by (intro sum.cong arg_cong2[where f="(/)"] refl) (simp add:comp)
also have "... = F2 xs" using 1 unfolding f2_exp_h[OF assms(1)] by simp
finally have exp_R: "(∫h. R h ∂p) = F2 xs" by simp
have "?L ≤ measure p {h. ¦R h - F2 xs¦ ≥ ε * F2 xs}" by (intro pmf_mono) auto
also have "... ≤ 𝒫(h in p. ¦R h - (∫h. R h ∂p)¦ ≥ ε * F2 xs)" unfolding exp_R by simp
also have "... ≤ measure_pmf.variance p R / (ε * F2 xs)^2"
using f2_gt_0 ε_gt_0 by (intro measure_pmf.Chebyshev_inequality) simp_all
also have "... ≤ (ε^2 * F2 xs^2/4) / (ε * F2 xs)^2"
by (intro divide_right_mono var_R) simp
also have "... = 1/4" using ε_gt_0 f2_gt_0 by (simp add:divide_simps)
finally show ?thesis by simp
qed
lemma example_4_correct:
assumes "set xs ⊆ {0..<n}"
shows "𝒫(ω in example_4 n xs. ¦ω - F2 xs¦ > ε * F2 xs) ≤ 1/4" (is "?L ≤ ?R")
proof -
define s :: nat where "s = nat ⌈8 / ε^2⌉"
define R where "R h = (∑j<s. (∑x←xs. h j x)^2)/s" for h :: "nat ⇒ nat ⇒ real"
let ?p = "sample_pro (ℋ 4 n (ℒ [-1,1::real]))"
let ?q = "prod_pmf {..<s} (λ_. ?p)"
have "?L = (∫h. indicator {h. ¦R h - F2 xs¦ > ε * F2 xs} h ∂ ?q)"
by (simp add:Let_def measure_bind_pmf R_def s_def indicator_def atLeast0LessThan)
also have "... = measure ?q {h. ¦R h - F2 xs¦ > ε * F2 xs}" by simp
also have "... ≤ ?R" unfolding R_def s_def
by (intro example_4_correct_aux[OF assms] prob_space.k_wise_indep_vars_triv
prob_space_measure_pmf indep_vars_Pi_pmf)
(auto intro: finite_pro_set simp add:Pi_pmf_component set_Pi_pmf)
finally show ?thesis by simp
qed
text ‹Instead of independent samples, we can choose the seeds using a second
pair-wise independent pseudorandom object. This algorithm requires only
$\mathcal O(\ln n + \varepsilon^{-2} \ln m)$ bits of space.›
fun example_5 :: "nat ⇒ nat list ⇒ real pmf"
where "example_5 n xs =
do {
let s = nat ⌈8 / ε^2⌉;
h ← sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1])));
return_pmf ((∑j < s. (∑x ← xs. h j x)^2)/s)
}"
lemma example_5_correct_aux:
assumes "set xs ⊆ {0..<n}"
defines "s ≡ nat ⌈8 / ε^2⌉"
defines "R ≡ (λh :: nat ⇒ nat ⇒ real. (∑j<s. (∑x←xs. h j x)^2)/real s)"
shows "measure (sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1])))) {h. ¦R h - F2 xs¦ > ε * F2 xs} ≤ 1/4"
proof -
let ?p = "sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1::real])))"
have "prob_space.k_wise_indep_vars ?p 2 (λ_. discrete) (λi x. x i) {..<s}"
using hash_pro_indep[OF prime_power_h2]
by (simp add: prob_space.k_wise_indep_vars_def[OF prob_space_measure_pmf])
thus ?thesis unfolding R_def s_def
by (intro example_4_correct_aux[OF assms(1)] finite_pro_set)
(simp_all add:hash_pro_component[OF prime_power_h2])
qed
lemma example_5_correct:
assumes "set xs ⊆ {0..<n}"
shows "𝒫(ω in example_5 n xs. ¦ω - F2 xs¦ > ε * F2 xs) ≤ 1/4" (is "?L ≤ ?R")
proof -
define s :: nat where "s = nat ⌈8 / ε^2⌉"
define R where "R h = (∑j<s. (∑x←xs. h j x)^2)/s" for h :: "nat ⇒ nat ⇒ real"
let ?p = "sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1::real])))"
have "?L = (∫h. indicator {h. ¦R h - F2 xs¦ > ε * F2 xs} h ∂ ?p)"
by (simp add:Let_def measure_bind_pmf R_def s_def indicator_def)
also have "... = measure ?p {h. ¦R h - F2 xs¦ > ε * F2 xs}" by simp
also have "... ≤ ?R" unfolding R_def s_def by (intro example_5_correct_aux[OF assms])
finally show ?thesis by simp
qed
text ‹The following algorithm improves on the previous one, by achieving a success probability
of $\delta$. This works by taking the median of $\mathcal O(\ln(\delta^{-1}))$ parallel independent
samples. It requires $\mathcal O(\ln(\delta^{-1}) (\ln n + \varepsilon^{-2} \ln m))$ bits of space.›
fun example_6 :: "nat ⇒ nat list ⇒ real pmf"
where "example_6 n xs =
do {
let s = nat ⌈8 / ε^2⌉; let t = nat ⌈8 * ln (1/δ)⌉;
h ← prod_pmf {0..<t} (λ_. sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1]))));
return_pmf (median t (λi. ((∑j < s. (∑x ← xs. h i j x)^2)/ s)))
}"
lemma example_6_correct:
assumes "set xs ⊆ {0..<n}"
shows "𝒫(ω in example_6 n xs. ¦ω - F2 xs¦ > ε * F2 xs) ≤ δ" (is "?L ≤ ?R")
proof -
define s where "s = nat ⌈8 / ε^2⌉"
define t where "t = nat ⌈8 * ln(1/δ)⌉"
define R where "R h = (∑j<s. (∑x←xs. h j x)^2)/s" for h :: "nat ⇒ nat ⇒ real"
define I where "I = {w. ¦w - F2 xs¦ ≤ ε*F2 xs}"
have "8 * ln (1 / δ) > 0" using δ_range by (intro mult_pos_pos ln_gt_zero) auto
hence t_gt_0: "t > 0" unfolding t_def by simp
have int_I: "interval I" unfolding interval_def I_def by auto
let ?p = "sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1::real])))"
let ?q = "prod_pmf {0..<t} (λ_. ?p)"
have "(∫h. (of_bool (R h ∉ I)::real) ∂?p) = (∫h. indicator {h. R h ∉ I} h ∂?p)"
unfolding of_bool_def indicator_def by simp
also have "... = measure ?p {h. R h ∉ I}" by simp
also have "... ≤ 1/4"
using example_5_correct_aux[OF assms] unfolding R_def s_def I_def by (simp add:not_le)
finally have 0: "(∫h. (of_bool (R h ∉ I)::real) ∂?p) ≤ 1/4" by simp
have "?L = (∫h. indicator {h. ¦median t (λi. R (h i)) - F2 xs¦ > ε * F2 xs} h ∂ ?q)"
by (simp add:Let_def measure_bind_pmf R_def s_def indicator_def t_def)
also have "... = measure ?q {h. median t (λi. R (h i)) ∉ I}"
unfolding I_def by (simp add:not_le)
also have "... ≤ measure ?q {h. t ≤ 2 * card {k. k < t ∧ R (h k) ∉ I}}"
using median_est_rev[OF int_I] by (intro pmf_mono) auto
also have "... = measure ?q {h. (∑k<t. of_bool(R (h k) ∉ I))/real t - 1/4 ≥ (1/4)}"
using t_gt_0 by (intro arg_cong2[where f="measure"]) (auto simp:Int_def divide_simps)
also have "... ≤ exp ( - 2 * real t * (1/4)^2)"
by (intro classic_chernoff_bound_one_sided t_gt_0 AE_pmfI 0) auto
also have "... = exp (- (real t / 8))" using t_gt_0 by (simp add:power2_eq_square)
also have "... ≤ exp (- of_int ⌈8 * ln (1 / δ)⌉ / 8)" unfolding t_def
by (intro iffD2[OF exp_le_cancel_iff] divide_right_mono iffD2[OF neg_le_iff_le]) auto
also have "... ≤ exp (- (8 * ln (1 / δ)) / 8)"
by (intro iffD2[OF exp_le_cancel_iff] divide_right_mono iffD2[OF neg_le_iff_le]) auto
also have "... = exp (- ln (1 / δ))" by simp
also have "... = δ" using δ_range by (subst ln_div) auto
finally show ?thesis by simp
qed
text ‹The following algorithm uses an expander random walk, instead of independent samples.
It requires only $\mathcal O(\ln n + \ln(\delta^{-1}) \varepsilon^{-2} \ln m)$ bits of space.›
fun example_7 :: "nat ⇒ nat list ⇒ real pmf"
where "example_7 n xs =
do {
let s = nat ⌈8 / ε^2⌉; let t = nat ⌈32 * ln (1/δ)⌉;
h ← sample_pro (ℰ t (1/8) (ℋ 2 s (ℋ 4 n (ℒ [-1,1]))));
return_pmf (median t (λi. ((∑j < s. (∑x ← xs. h i j x)^2)/ s)))
}"
lemma example_7_correct:
assumes "set xs ⊆ {0..<n}"
shows "𝒫(ω in example_7 n xs. ¦ω - F2 xs¦ > ε * F2 xs) ≤ δ" (is "?L ≤ ?R")
proof -
define s t where s_def: "s = nat ⌈8 / ε^2⌉" and t_def: "t = nat ⌈32 * ln(1/δ)⌉"
define R where "R h = (∑j<s. (∑x←xs. h j x)^2)/s" for h :: "nat ⇒ nat ⇒ real"
define I where "I = {w. ¦w - F2 xs¦ ≤ ε*F2 xs}"
have "8 * ln (1 / δ) > 0" using δ_range by (intro mult_pos_pos ln_gt_zero) auto
hence t_gt_0: "t > 0" unfolding t_def by simp
have int_I: "interval I" unfolding interval_def I_def by auto
let ?p = "sample_pro (ℋ 2 s (ℋ 4 n (ℒ [-1,1::real])))"
let ?q = "sample_pro (ℰ t (1/8) (ℋ 2 s (ℋ 4 n (ℒ [-1,1]))))"
have "(∫h. (of_bool (R h ∉ I)::real) ∂?p) = (∫h. indicator {h. R h ∉ I} h ∂?p)"
by (simp add:of_bool_def indicator_def)
also have "... = measure ?p {h. R h ∉ I}" by simp
also have "... ≤ 1/4"
using example_5_correct_aux[OF assms] unfolding R_def s_def I_def by (simp add:not_le)
finally have *: "(∫h. (of_bool (R h ∉ I)::real) ∂?p) ≤ 1/4" by simp
have "?L = (∫h. indicator {h. ¦median t (λi. R (h i)) - F2 xs¦ > ε * F2 xs} h ∂ ?q)"
by (simp add:Let_def measure_bind_pmf R_def s_def indicator_def t_def)
also have "... = measure ?q {h. median t (λi. R (h i)) ∉ I}"
unfolding I_def by (simp add:not_le)
also have "... ≤ measure ?q {h. t ≤ 2 * card {k. k < t ∧ R (h k) ∉ I}}"
using median_est_rev[OF int_I] by (intro pmf_mono) auto
also have "... = measure ?q {h. 1/8 + 1/8 ≤ (∑k<t. of_bool(R (h k) ∉ I))/real t - 1/4}"
using t_gt_0 by (intro arg_cong2[where f="measure"] Collect_cong refl)
(auto simp add:of_bool_def sum.If_cases Int_def field_simps)
also have "... ≤ exp (- 2 * real t * (1/8)⇧2)"
by (intro expander_chernoff_bound_one_sided t_gt_0 *) auto
also have "... = exp (- (real t / 32))" using t_gt_0 by (simp add:power2_eq_square)
also have "... ≤ exp (- of_int ⌈32 * ln (1 / δ)⌉ / 32)" unfolding t_def
by (intro iffD2[OF exp_le_cancel_iff] divide_right_mono iffD2[OF neg_le_iff_le]) auto
also have "... ≤ exp (- (32 * ln (1 / δ)) / 32)"
by (intro iffD2[OF exp_le_cancel_iff] divide_right_mono iffD2[OF neg_le_iff_le]) auto
also have "... = exp (- ln (1 / δ))" by simp
also have "... = δ" using δ_range by (subst ln_div) auto
finally show ?thesis by simp
qed
end
end