Theory CVM_New_Unbiased_Algorithm
section ‹The New Unbiased Algorithm\label{sec:cvm_new}›
text ‹In this section, we introduce the new algorithm variant promised in the abstract.
The main change is to replace the subsampling step of the original algorithm, which removes each
element of the buffer independently with probability $f$.
Instead, we choose a random $nf$-subset of the buffer (see Algorithm~\ref{alg:cvm_new}).
(This means $f$, $n$ must be chosen, such that $nf$ is an integer.)
\begin{algorithm}[h!]
\caption{New CVM algorithm.\label{alg:cvm_new}}
\begin{algorithmic}[1]
\Require Stream elements $a_1,\ldots,a_l$, $0 < \varepsilon$, $0 < \delta < 1$, $f$ subsampling param.
\Ensure An estimate $R$, s.t., $\prob \left( | R - |A| | > \varepsilon |A| \right) \leq \delta$ where $A := \{a_1,\ldots,a_l\}.$
\State $\chi \gets \{\}, p \gets 1, n \geq \left\lceil \frac{12}{\varepsilon^2} \ln(\frac{3l}{\delta}) \right\rceil$
\For{$i \gets 1$ to $l$}
\State $b \getsr \Ber(p)$ \Comment insert $a_i$ with probability $p$ (and remove it otherwise)
\If{$b$}
\State $\chi \gets \chi \cup \{a_i\}$
\Else
\State $\chi \gets \chi - \{a_i\}$
\EndIf
\If{$|\chi| = n$}
\State $\chi \getsr \mathrm{subsample}(\chi)$ \Comment Choose a random $nf$-subset of $\chi$
\State $p \gets p f$
\EndIf
\EndFor
\State \Return $\frac{|\chi|}{p}$ \Comment estimate cardinality of $A$
\end{algorithmic}
\end{algorithm}
The fact that this still preserves the required inequality for the subsampling operation
(Eq.~\ref{eq:subsample_condition}) follows from the negative associativity of permutation
distributions~\cite[Th. 10]{dubhashi1996}.
(See also our formalization of the concept~\cite{Negative_Association-AFP}.)
Because the subsampling step always removes elements unconditionally, the second check, whether
the subsampling succeeded of the original algorithm is not necessary anymore.
This improves the space usage of the algorithm, because the first reduction argument from
Section~\ref{sec:cvm_original} is now obsolete. Moreover the resulting algorithm is now
unbiased, because it is an instance of the abstract algorithm of Section~\ref{sec:cvm_abs}.›
theory CVM_New_Unbiased_Algorithm
imports
CVM_Abstract_Algorithm
Probabilistic_Prime_Tests.Generalized_Primality_Test
Negative_Association.Negative_Association_Permutation_Distributions
begin
unbundle no_vec_syntax
context
fixes f :: real and n :: nat
assumes f_range: ‹f ∈ {1/2..<1}› ‹n * f ∈ ℕ› and n_gt_0: ‹n > 0›
begin
definition ‹initial_state = State {} 1›
fun subsample where
‹subsample χ = pmf_of_set {S. S ⊆ χ ∧ card S = n * f}›
fun step where
‹step a (State χ p) = do {
b ← bernoulli_pmf p;
let χ = (if b then χ ∪ {a} else χ - {a});
if card χ = n then do {
χ ← subsample χ;
return_pmf (State χ (p * f))
} else do {
return_pmf (State χ p)
}
}›
fun run_steps where
‹run_steps xs = foldM_pmf step xs initial_state›
fun estimate where
‹estimate (State χ p) = card χ / p›
fun run_algo where
‹run_algo xs = map_pmf estimate (run_steps xs)›
definition ‹subsample_size = (THE x. real x = n * f)›
declare subsample.simps [simp del]
lemma subsample_size_eq:
‹real subsample_size = n * f›
proof -
obtain a where a_def:‹real a = real n * f› using f_range(2) by (metis Nats_cases)
show ?thesis
unfolding subsample_size_def using a_def
by (rule theI2[where a=‹a›]) (use a_def in auto)
qed
lemma subsample_size:
‹subsample_size < n› ‹2 * subsample_size ≥ n›
proof (goal_cases)
case 1
have ‹real subsample_size < real n›
unfolding subsample_size_eq using f_range(1) n_gt_0 by auto
thus ?case by simp
next
case 2
have ‹real n ≤ 2 * real subsample_size›
using f_range(1) n_gt_0 unfolding subsample_size_eq by auto
thus ?case by simp
qed
lemma subsample_finite_nonempty:
assumes ‹card U = n›
shows
‹{T. T ⊆ U ∧ card T = subsample_size} ≠ {}› (is ‹?C ≠ {}›)
‹finite {T. T ⊆ U ∧ card T = subsample_size}›
‹subsample U = pmf_of_set {T. T ⊆ U ∧ card T = subsample_size}›
‹finite (set_pmf (subsample U))›
proof -
have fin_U: ‹finite U› using assms subsample_size
by (meson card_gt_0_iff le0 order_le_less_trans order_less_le_trans)
have a: ‹card U choose subsample_size > 0›
using subsample_size assms by (intro zero_less_binomial) auto
show b:‹subsample U = pmf_of_set ?C›
using subsample_size_eq unfolding subsample.simps
by (intro arg_cong[where f=‹pmf_of_set›] Collect_cong) auto
with assms subsample_size have ‹card ?C > 0›
using n_subsets[OF fin_U] by simp
thus ‹?C ≠ {}› ‹finite ?C› using card_gt_0_iff by blast+
thus ‹finite (set_pmf (subsample U))› unfolding b by auto
qed
lemma int_prod_subsample_eq_prod_int:
fixes g :: ‹bool ⇒ real›
assumes ‹card U = n› ‹S ⊆ U› ‹range g ⊆ {0..}›
shows ‹(∫ω. (∏s∈S. g(s ∈ ω)) ∂subsample U) ≤ (∏s∈S. (∫ω. g ω ∂bernoulli_pmf f))› (is ‹?L ≤ ?R›)
proof -
define η where ‹η ≡ if g True ≥ g False then Fwd else Rev›
have fin_U: ‹finite U› using assms subsample_size
by (meson card_gt_0_iff le0 order_le_less_trans order_less_le_trans)
note subsample = subsample_finite_nonempty[OF assms(1)]
note [simp] = integrable_measure_pmf_finite[OF subsample(4)]
let ?C = ‹{T. T ⊆ U ∧ card T = subsample_size}›
have subssample_size_le_card_U: ‹subsample_size ≤ card U›
using subsample_size unfolding assms(1) by simp
have ‹measure_pmf.neg_assoc (subsample U) (λs ω. (s ∈ ω)) U›
using subssample_size_le_card_U unfolding subsample
by (intro n_subsets_distribution_neg_assoc fin_U)
hence na: ‹measure_pmf.neg_assoc (subsample U) (λs ω. (s ∈ ω)) S›
using measure_pmf.neg_assoc_subset[OF assms(2)] by auto
have fin_S: ‹finite S› using assms(2) fin_U finite_subset by auto
note na_imp_prod_mono = has_int_thatD(2)[OF measure_pmf.neg_assoc_imp_prod_mono[OF fin_S na]]
have g_borel: ‹g ∈ borel_measurable borel› by (intro borel_measurable_continuous_onI) simp
have g_mono_aux: ‹g x ≤≥⇘η⇙ g y› if ‹x ≤ y› for x y
unfolding η_def using that by simp (smt (verit, best))
have g_mono: ‹monotone (≤) (≤≥⇘η⇙) g›
by (intro monotoneI) (auto simp:dir_le_refl intro!:g_mono_aux)
have a: ‹map_pmf (λω. s ∈ ω) (subsample U) = bernoulli_pmf f› if ‹s ∈ U› for s
proof -
have ‹measure (pmf_of_set ?C) {x. s ∈ x} = real subsample_size / card U›
by (intro n_subsets_prob subssample_size_le_card_U that fin_U)
also have ‹… = f› unfolding subsample_size_eq assms(1) using n_gt_0 by auto
finally have ‹measure (pmf_of_set ?C) {x. s ∈ x} = f› by simp
thus ?thesis
unfolding subsample by (intro eq_bernoulli_pmfI) (simp add: pmf_map vimage_def)
qed
have ‹?L ≤ (∏s∈S. (∫ω. g (s ∈ ω) ∂subsample U))›
by (intro na_imp_prod_mono[OF _ g_mono] g_borel assms(3)) auto
also have ‹… = (∏s∈S. (∫ω. g ω ∂map_pmf (λω. s ∈ ω) (subsample U)))› by simp
also have ‹… = ?R› using a assms(2) by (intro prod.cong refl) (metis in_mono)
finally show ?thesis .
qed
schematic_goal step_n_def: ‹step x σ = ?x›
by (subst state.collapse[symmetric], subst step.simps, rule refl)
interpretation abs: cvm_algo_abstract n f subsample
rewrites ‹abs.run_steps = run_steps› and ‹abs.estimate = estimate›
proof -
show abs:‹cvm_algo_abstract n f subsample›
proof (unfold_locales, goal_cases)
case 1 thus ?case using subsample_size by auto
next
case 2 thus ?case using f_range by auto
next
case (3 U x) thus ?case using subsample_finite_nonempty[OF 3(1)] by simp
next
case (4 g U S) thus ?case by (intro int_prod_subsample_eq_prod_int) auto
qed
have a:‹(λx σ. cvm_algo_abstract.step_1 x σ ⤜ cvm_algo_abstract.step_2 n f subsample) = step›
unfolding cvm_algo_abstract.step_1_def[OF abs] cvm_algo_abstract.step_2_def[OF abs] step_n_def
by (intro ext) (simp add: bind_assoc_pmf Let_def bind_return_pmf Set.remove_def cong:if_cong)
have c:‹cvm_algo_abstract.initial_state = initial_state›
unfolding cvm_algo_abstract.initial_state_def[OF abs] initial_state_def by auto
show ‹cvm_algo_abstract.run_steps n f subsample = run_steps›
unfolding cvm_algo_abstract.run_steps_def[OF abs] run_steps.simps a c by simp
show ‹cvm_algo_abstract.estimate = estimate›
unfolding cvm_algo_abstract.estimate_def[OF abs]
by (intro ext) (metis estimate.simps state.collapse)
qed
theorem unbiasedness: ‹measure_pmf.expectation (run_algo xs) id = card (set xs)›
unfolding run_algo.simps integral_map_pmf using abs.unbiasedness by simp
theorem correctness:
assumes ‹ε ∈ {0<..<1::real}› ‹δ ∈ {0<..<1::real}›
assumes ‹real n ≥ 12 / ε⇧2 * ln (3 * real (length xs) / δ)›
defines ‹A ≡ real (card (set xs))›
shows ‹𝒫(R in run_algo xs. ¦R - A¦ > ε * A) ≤ δ›
using assms(3) unfolding A_def using abs.correctness[OF assms(1,2)] by auto
lemma space_usage:
‹AE σ in run_steps xs. card (state_χ σ) < n ∧ finite (state_χ σ)›
proof -
define ρ where ‹ρ = FinalState xs›
have ‹card (state_χ σ) < n + (case ρ of FinalState _ ⇒ 0 | IntermState _ _ ⇒ 1)›
if ‹σ ∈ set_pmf (abs.run_state_pmf ρ)› for σ
using that
proof (induction ρ arbitrary:σ rule:run_state_induct)
case 1
then show ?case using n_gt_0 by (simp add:initial_state_def)
next
case (2 xs x)
have ‹card (state_χ τ) < n ∧ finite (state_χ τ)›
if ‹τ ∈ set_pmf (abs.run_state_pmf (FinalState xs))› for τ
using 2(1) abs.state_χ_finite[where ρ=‹FinalState xs›] that by (simp add:AE_measure_pmf_iff)
thus ?case
using 2(2) unfolding abs.step_1_def abs.run_state_pmf.simps Let_def map_pmf_def[symmetric]
by (force simp:card_insert_if remove_def)
next
case (3 xs x)
define p where ‹p = abs.run_state_pmf (IntermState xs x)›
have a: ‹abs.run_state_pmf (FinalState (xs@[x])) = p ⤜ abs.step_2›
by (simp add:p_def abs.run_steps_snoc del:run_steps.simps)
have b:‹card χ < card (state_χ τ)›
if ‹card (state_χ τ) = n› ‹χ ∈ set_pmf (subsample (state_χ τ))› ‹τ ∈ set_pmf p› for χ τ
proof -
from subsample_finite_nonempty[OF that(1)]
have ‹card χ = subsample_size› using that unfolding subsample_def by auto
thus ?thesis using subsample_size(1) that by auto
qed
have ‹card (state_χ τ) < n ∨ card (state_χ τ) = n› ‹finite (state_χ τ)›
if ‹τ ∈ set_pmf p› for τ
using 3(1) abs.state_χ_finite[where ρ=‹IntermState xs x›] that unfolding p_def
by (auto simp:AE_measure_pmf_iff less_Suc_eq)
hence ‹card (state_χ σ) < n›
using 3(2) unfolding a abs.step_2_def Let_def by (auto intro!:b simp:if_distrib if_distribR)
thus ?case by simp
qed
thus ?thesis using abs.state_χ_finite[where ρ=‹FinalState xs›] unfolding ρ_def
by (simp add:AE_measure_pmf_iff)
qed
end
end