Theory CVM_Original_Algorithm
section ‹The Original CVM Algorithm\label{sec:cvm_original}›
text ‹In this section, we verify the algorithm as presented by Chakrabory et
al.~\cite{chakraborty2022} (replicated, here, in Algorithm~\ref{alg:cvm_classic}),
with the following caveat:
In the original algorithm the elements are removed with probability $f := \frac{1}{2}$ in the
subsampling step. The version verified here allows for any $f \in [\frac{1}{2},e^{-1/12}]$.
\begin{algorithm}[h!]
\caption{Original CVM algorithm.\label{alg:cvm_classic}}
\begin{algorithmic}[1]
\Require Stream elements $a_1,\ldots,a_l$, $0 < \varepsilon$, $0 < \delta < 1$, $f$ subsampling param.
\Ensure An estimate $R$, s.t., $\prob \left( | R - |A| | > \varepsilon |A| \right) \leq \delta$ where $A := \{a_1,\ldots,a_l\}.$
\State $\chi \gets \{\}, p \gets 1, n \geq \left\lceil \frac{12}{\varepsilon^2} \ln(\frac{6l}{\delta}) \right\rceil$
\For{$i \gets 1$ to $l$}
\State $b \getsr \Ber(p)$ \Comment insert $a_i$ with probability $p$ (and remove it otherwise)
\If{$b$}
\State $\chi \gets \chi \cup \{a_i\}$
\Else
\State $\chi \gets \chi - \{a_i\}$
\EndIf
\If{$|\chi| = n$}
\State $\chi \getsr \mathrm{subsample}(\chi)$ \Comment keep each element of $\chi$ indep. with prob. $f$
\State $p \gets p f$
\EndIf
\If{$|\chi| = n$}
\State \Return $\bot$
\EndIf
\EndFor
\State \Return $\frac{|\chi|}{p}$ \Comment estimate cardinality of $A$
\end{algorithmic}
\end{algorithm}
The first step of the proof is identical to the original proof~\cite{chakraborty2022}, where the
above algorithm is approximated by a second algorithm, where lines 11--12 are removed, i.e., the two
algorithms behave identically, unless the very improbable event---where the subsampling step fails
to remove any elements---occurs. It is possible to show that the total variational distance between
the two algorithms is at most $\frac{\delta}{2}$.
In the second step, we verify that the probability that the second algorithm returns an estimate
outside of the desired interval is also at most $\frac{\delta}{2}$. This, of course, works by
noticing that it is an instance of the abstract algorithm we introduced in
Section~\ref{sec:cvm_abs}. In combination, we conclude a failure probability of $\delta$ for the
unmodified version of the algorithm.
On the other hand, the fact that the number of elements in the buffer is at most $n$ can be seen
directly from Algorithm~\ref{alg:cvm_classic}.›
theory CVM_Original_Algorithm
imports CVM_Abstract_Algorithm
begin
context
fixes f :: real
fixes n :: nat
assumes f_range: ‹f ∈ {1/2..exp(-1/12)}›
assumes n_gt_0: ‹n > 0›
begin
text ‹Line 1:›
definition initial_state :: ‹'a state› where
‹initial_state = State {} 1›
text ‹Lines 3--7:›
fun step_1 :: ‹'a ⇒ 'a state ⇒ 'a state spmf› where
‹step_1 a (State χ p) =
do {
b ← bernoulli_pmf p;
let χ = (if b then χ ∪ {a} else χ - {a});
return_spmf (State χ p)
}›
definition subsample :: ‹'a set ⇒ 'a set spmf› where
‹subsample χ =
do {
keep_in_χ ← prod_pmf χ (λ_. bernoulli_pmf f);
return_spmf (Set.filter keep_in_χ χ)
}›
text ‹Lines 8--10:›
fun step_2 :: ‹'a state ⇒ 'a state spmf› where
‹step_2 (State χ p) =
do {
if card χ = n then do {
χ ← subsample χ;
return_spmf (State χ (p * f))
} else
return_spmf (State χ p)
}›
text ‹Lines 11--12:›
fun step_3 :: ‹'a state ⇒ 'a state spmf› where
‹step_3 (State χ p) =
do {
if card χ = n
then fail_spmf
else return_spmf (State χ p)
}›
text ‹Lines 1--12:›
definition run_steps :: ‹'a list ⇒ 'a state spmf› where
‹run_steps xs ≡ foldM_spmf (λx σ. step_1 x σ ⤜ step_2 ⤜ step_3) xs initial_state›
text ‹Line 13:›
definition estimate :: ‹'a state ⇒ real› where
‹estimate σ = card (state_χ σ) / state_p σ›
definition run_algo :: ‹'a list ⇒ real spmf› where
‹run_algo xs = map_spmf estimate (run_steps xs)›
schematic_goal step_1_m_def: ‹step_1 x σ = ?x›
by (subst state.collapse[symmetric], subst step_1.simps, rule refl)
schematic_goal step_2_m_def: ‹step_2 σ = ?x›
by (subst state.collapse[symmetric], subst step_2.simps, rule refl)
schematic_goal step_3_m_def: ‹step_3 σ = ?x›
by (subst state.collapse[symmetric], subst step_3.simps, rule refl)
lemma ord_spmf_remove_step3:
‹ord_spmf (=) (step_1 x σ ⤜ step_2 ⤜ step_3) (step_1 x σ ⤜ step_2)›
proof -
have ‹ord_spmf (=) (step_2 x ⤜ step_3) (step_2 x)› for x :: ‹'a state›
proof -
have ‹ord_spmf (=) (step_2 x ⤜ step_3) (step_2 x ⤜ return_spmf)›
by (intro bind_spmf_mono') (simp_all add:step_3_m_def)
thus ?thesis by simp
qed
thus ?thesis unfolding bind_spmf_assoc by (intro bind_spmf_mono') simp_all
qed
lemma ord_spmf_run_steps:
‹ord_spmf (=) (run_steps xs) (foldM_spmf (λx σ. step_1 x σ ⤜ step_2) xs initial_state)›
unfolding run_steps_def
proof (induction xs rule:rev_induct)
case Nil
then show ?case by simp
next
case (snoc x xs)
show ?case
unfolding run_steps_def foldM_spmf_snoc
by (intro ord_spmf_remove_step3 bind_spmf_mono' snoc)
qed
lemma f_range_simple: ‹f ∈ {1/2..<1}›
proof -
have ‹exp (- 1 / 12) < (1::real)› by (approximation 5)
from dual_order.strict_trans2[OF this]
show ?thesis using f_range by auto
qed
text ‹Main result:›
theorem correctness:
fixes xs :: ‹'a list›
assumes ‹ε ∈ {0<..<1}› ‹δ ∈ {0<..<1}›
assumes ‹real n ≥ 12 / ε⇧2 * ln (6 * real (length xs) / δ)›
defines ‹A ≡ real (card (set xs))›
shows ‹𝒫(ω in run_algo xs. fails_or_satisfies (λR. ¦R - A¦ > ε * A) ω) ≤ δ›
(is ‹?L ≤ ?R›)
proof -
define abs_subsample where
‹abs_subsample χ = map_pmf (λω. Set.filter ω χ) (prod_pmf χ (λ_. bernoulli_pmf f))›
for χ :: ‹'a set›
interpret abs:cvm_algo_abstract n f abs_subsample
rewrites ‹abs.estimate = estimate›
proof -
show abs:‹cvm_algo_abstract n f abs_subsample›
proof (unfold_locales, goal_cases)
case 1 thus ?case by (rule n_gt_0)
next
case 2 thus ?case using f_range_simple by auto
next
case (3 U x)
then show ?case unfolding abs_subsample_def by auto
next
case (4 g χ S)
hence fin_U: ‹finite χ› using n_gt_0 card_gt_0_iff by metis
note conv = Pi_pmf_subset[OF this 4(1)]
have ‹(∫ω. (∏s∈S. g (s ∈ ω)) ∂abs_subsample χ) =
(∫ω. (∏s∈S. g (s ∈ χ ∧ ω s)) ∂prod_pmf χ (λ_. bernoulli_pmf f))›
unfolding abs_subsample_def by (simp cong:prod.cong)
also have ‹… = (∫ω. (∏s∈S. g (s ∈ χ ∧ ω s)) ∂prod_pmf S (λ_. bernoulli_pmf f))›
unfolding conv by simp
also have ‹… = (∏s∈S. (∫ω. g (s ∈ χ ∧ ω) ∂bernoulli_pmf f))›
using fin_U finite_subset[OF 4(1)]
by (intro expectation_prod_Pi_pmf integrable_measure_pmf_finite) auto
also have ‹… = (∏s∈S. (∫ω. g ω ∂bernoulli_pmf f))›
using 4(1) by (intro prod.cong refl) auto
finally show ?case by simp
qed
show ‹cvm_algo_abstract.estimate = (estimate :: 'a state ⇒ real)›
unfolding cvm_algo_abstract.estimate_def[OF abs] estimate_def by simp
qed
have a: ‹step_1 σ x = spmf_of_pmf (abs.step_1 σ x)› for σ x
unfolding step_1_m_def abs.step_1_def Let_def spmf_of_pmf_def by (simp add:map_bind_pmf)
have b: ‹step_2 σ = map_pmf Some (abs.step_2 σ)› for σ
unfolding step_2_m_def abs.step_2_def subsample_def abs_subsample_def Let_def
by (simp add:map_bind_pmf bind_pmf_return_spmf)
have c: ‹abs.initial_state = initial_state›
unfolding initial_state_def abs.initial_state_def by simp
have d: ‹subsample χ = spmf_of_pmf (abs_subsample χ)› for χ
unfolding subsample_def abs_subsample_def map_pmf_def[symmetric]
by (simp add:spmf_of_pmf_def map_pmf_comp)
define α :: real where ‹α = f ^ n›
have α_range: ‹α ∈ {0..1}›
using f_range_simple unfolding α_def by (auto intro:power_le_one)
hence [simp]: ‹¦α¦ ≤ 1› by auto
have ‹(∫x. (if card x = n then 1 else 0) ∂abs_subsample χ) ≤ α› (is ‹?L1 ≤ _›)
if that': ‹card χ = n› for χ
proof -
have fin_U: ‹finite χ› using n_gt_0 that card_gt_0_iff by metis
have ‹(∏s∈χ. of_bool (s ∈ x)::real) = of_bool(card x = n)›
if ‹x ∈ set_pmf (abs_subsample χ)› for x
proof -
have x_ran: ‹x ⊆ χ› using that unfolding abs_subsample_def by auto
have ‹(∏s∈χ. of_bool (s ∈ x)::real) = of_bool(x = χ)›
using fin_U x_ran by (induction χ rule:finite_induct) auto
also have ‹… = of_bool (card x = card χ)›
using x_ran fin_U card_subset_eq by (intro arg_cong[where f=‹of_bool›]) blast
also have ‹… = of_bool (card x = n)› using that' by simp
finally show ?thesis by auto
qed
hence ‹?L1 = (∫x. (∏s ∈ χ. of_bool(s ∈ x)) ∂abs_subsample χ)›
by (intro integral_cong_AE AE_pmfI) simp_all
also have ‹… ≤ (∏s ∈ χ. (∫x. of_bool x ∂bernoulli_pmf f))›
by (intro abs.subsample_inequality that) auto
also have ‹… = f ^ card χ› using f_range_simple by simp
also have ‹… = α› unfolding α_def that by simp
finally show ?thesis by simp
qed
hence e: ‹pmf (step_2 σ ⤜ step_3) None ≤ α› for σ :: ‹'a state›
using α_range unfolding step_2_m_def step_3_m_def d Let_def
by (simp add:pmf_bind bind_pmf_return_spmf if_distrib if_distribR cong:if_cong)
have ‹pmf (step_1 x σ ⤜ step_2 ⤜ step_3) None ≤ α› for σ and x :: 'a
proof-
have ‹pmf (step_1 x σ ⤜ step_2 ⤜ step_3) None ≤ 0 + (∫_. α ∂ measure_spmf (step_1 x σ))›
unfolding bind_spmf_assoc pmf_bind_spmf_None[where p=‹step_1 x σ›]
by (intro add_mono integral_mono_AE measure_spmf.integrable_const_bound[where B=‹1›]
iffD2[OF AE_measure_spmf_iff] ballI e)
(simp_all add:pmf_le_1 step_1_m_def map_pmf_def[symmetric] pmf_map vimage_def Let_def)
also have ‹… ≤ α› using α_range by (simp add: mult_left_le_one_le weight_spmf_le_1)
finally show ?thesis by simp
qed
hence ‹prob_fail (run_steps xs) ≤ length xs * α›
unfolding run_steps_def by (intro prob_fail_foldM_spmf_le[where P=‹λ_. True›]) auto
also have ‹… ≤ δ / 2›
proof (cases ‹xs = []›)
case True
thus ?thesis using assms(2) by auto
next
case False
have ‹δ ≤ 6 * 1› using assms(2) by simp
also have ‹… ≤ 6 * real (length xs)›
using False by (intro mult_mono order.refl) (cases xs, auto)
finally have [simp]: ‹ δ ≤ 6 * real (length xs)› by simp
have ‹2 * real (length xs) * f ^ n ≤ 2 * real (length xs) * exp (-1/12)^n›
using f_range by (intro mult_left_mono power_mono) auto
also have ‹… = 2 * real (length xs) * exp (-real n / 12)›
unfolding exp_of_nat_mult[symmetric] by simp
also have ‹… ≤ 2 * real (length xs) * exp (-(12 / ε ^ 2 * ln (6 * real (length xs) / δ))/12)›
using assms(3) by (intro mult_left_mono iffD2[OF exp_le_cancel_iff] divide_right_mono) auto
also have ‹… = 2 * real (length xs) * exp (-ln (6 * real (length xs) / δ) / ε^2 )›
by auto
also have ‹… ≤ 2 * real (length xs) * exp (-ln (6 * real (length xs) / δ) / 1 )›
using assms(1,2) False
by (intro mult_left_mono iffD2[OF exp_le_cancel_iff] divide_left_mono_neg power_le_one)
(auto intro!:ln_ge_zero simp:divide_simps)
also have ‹… = 2 * real (length xs) * exp (ln (inverse (6 * real (length xs) / δ)))›
using False assms(2) by (subst ln_inverse[symmetric]) auto
also have ‹… = 2 * real (length xs) / (6 * real (length xs) / δ)›
using assms(1,2) False by (subst exp_ln) auto
also have ‹… = δ / 3› using False assms(2) by auto
also have ‹… ≤ δ› using assms(2) by auto
finally have ‹2 * real (length xs) * f^n ≤ δ› by simp
thus ?thesis unfolding α_def by simp
qed
finally have f:‹prob_fail (run_steps xs) ≤ δ / 2› by simp
have g:‹spmf_of_pmf (abs.run_steps xs) = foldM_spmf (λx σ. step_1 x σ ⤜ step_2) xs initial_state›
unfolding abs.run_steps_def foldM_spmf_of_pmf_eq(2)[symmetric]
unfolding spmf_of_pmf_def map_pmf_def c b a
by (simp add:bind_assoc_pmf bind_spmf_def bind_return_pmf)
have ‹?L ≤ measure (run_steps xs) {None} +
measure (measure_spmf (run_steps xs)) {x. ¦estimate x - A¦ > ε * A}›
unfolding run_algo_def measure_measure_spmf_conv_measure_pmf measure_map_pmf
by (intro pmf_add) (auto split:option.split_asm)
also have ‹… ≤ δ / 2 + measure (measure_spmf (run_steps xs)) {x. ¦estimate x - A¦ > ε * A}›
unfolding measure_pmf_single by (intro add_mono f order.refl)
also have ‹… ≤ δ/2+measure(measure_spmf (spmf_of_pmf (abs.run_steps xs))) {x. ¦estimate x-A¦>ε*A}›
using ord_spmf_eqD_emeasure[OF ord_spmf_run_steps] unfolding measure_spmf.emeasure_eq_measure g
by (intro add_mono) auto
also have ‹… ≤ δ / 2 + measure (abs.run_steps xs) {x. ¦estimate x - A¦ > ε * A}›
using measure_spmf_map_pmf_Some spmf_of_pmf_def by auto
also have ‹… ≤ δ / 2 + δ / 2›
using assms(1-3) unfolding A_def by (intro add_mono abs.correctness) auto
finally show ?thesis by simp
qed
lemma space_usage:
‹AE σ in measure_spmf (run_steps xs). card (state_χ σ) < n ∧ finite (state_χ σ)›
proof (induction xs rule:rev_induct)
case Nil thus ?case using n_gt_0 by (simp add:run_steps_def initial_state_def)
next
case (snoc x xs)
define p1 where ‹p1 = run_steps xs ⤜ step_1 x›
define p2 where ‹p2 = p1 ⤜ step_2›
define p3 where ‹p3 = p2 ⤜ step_3›
have a:‹run_steps (xs@[x]) = p3›
unfolding run_steps_def p1_def p2_def p3_def foldM_spmf_snoc by (simp add:bind_assoc_pmf)
have ‹card (state_χ σ) ≤ n ∧ finite (state_χ σ)› if ‹σ ∈ set_spmf p1› for σ
using snoc that less_imp_le unfolding p1_def
by (auto simp: step_1_m_def set_bind_spmf set_spmf_bind_pmf Let_def card_insert_if)+
hence ‹card (state_χ σ) ≤ n ∧ finite (state_χ σ)› if ‹σ ∈ set_spmf p2› for σ
using that card_filter_mono unfolding p2_def
by (auto intro!:card_filter_mono simp:step_2_m_def set_bind_spmf set_spmf_bind_pmf
subsample_def Let_def if_distrib)
hence ‹card (state_χ σ) < n ∧ finite (state_χ σ)› if ‹σ ∈ set_spmf p3› for σ
using that unfolding p3_def
by (auto intro:le_neq_implies_less simp:step_3_m_def set_bind_spmf if_distrib)
thus ?case unfolding a by simp
qed
end
end