Theory CVM_New_Unbiased_Algorithm

section ‹The New Unbiased Algorithm\label{sec:cvm_new}›

text ‹In this section, we introduce the new algorithm variant promised in the abstract.

The main change is to replace the subsampling step of the original algorithm, which removes each
element of the buffer independently with probability $f$.
Instead, we choose a random $nf$-subset of the buffer (see Algorithm~\ref{alg:cvm_new}).
(This means $f$, $n$ must be chosen, such that $nf$ is an integer.)

\begin{algorithm}[h!]
	\caption{New CVM algorithm.\label{alg:cvm_new}}
	\begin{algorithmic}[1]
  \Require Stream elements $a_1,\ldots,a_l$, $0 < \varepsilon$, $0 < \delta < 1$, $f$ subsampling param.
  \Ensure An estimate $R$, s.t., $\prob \left( | R - |A| | > \varepsilon |A| \right) \leq \delta$ where $A := \{a_1,\ldots,a_l\}.$
  \State $\chi \gets \{\}, p \gets 1, n \geq \left\lceil \frac{12}{\varepsilon^2} \ln(\frac{3l}{\delta}) \right\rceil$
  \For{$i \gets 1$ to $l$}
    \State $b \getsr \Ber(p)$ \Comment insert $a_i$ with probability $p$ (and remove it otherwise)
    \If{$b$}
      \State $\chi \gets \chi \cup \{a_i\}$
    \Else
      \State $\chi \gets \chi - \{a_i\}$
    \EndIf
    \If{$|\chi| = n$}
      \State $\chi \getsr \mathrm{subsample}(\chi)$ \Comment Choose a random $nf$-subset of $\chi$
      \State $p \gets p f$
    \EndIf
  \EndFor
  \State \Return $\frac{|\chi|}{p}$ \Comment estimate cardinality of $A$
\end{algorithmic}
\end{algorithm}

The fact that this still preserves the required inequality for the subsampling operation
(Eq.~\ref{eq:subsample_condition}) follows from the negative associativity of permutation
distributions~\cite[Th. 10]{dubhashi1996}.

(See also our formalization of the concept~\cite{Negative_Association-AFP}.)

Because the subsampling step always removes elements unconditionally, the second check, whether
the subsampling succeeded of the original algorithm is not necessary anymore.

This improves the space usage of the algorithm, because the first reduction argument from
Section~\ref{sec:cvm_original} is now obsolete. Moreover the resulting algorithm is now
unbiased, because it is an instance of the abstract algorithm of Section~\ref{sec:cvm_abs}.›

theory CVM_New_Unbiased_Algorithm
  imports
    CVM_Abstract_Algorithm
    Probabilistic_Prime_Tests.Generalized_Primality_Test
    Negative_Association.Negative_Association_Permutation_Distributions
begin

unbundle no vec_syntax

context
  fixes f :: real and n :: nat
  assumes f_range: ‹f ∈ {1/2..<1}› ‹n * f ∈ ℕ› and n_gt_0: ‹n > 0›
begin

definition ‹initial_state = State {} 1› ― ‹Setup initial state $\chi=\emptyset$ and $p=1$.\;›
fun subsample where ― ‹Subsampling operation: Sample random $n f$ subset.\;›
  ‹subsample χ = pmf_of_set {S. S ⊆ χ ∧ card S = n * f}›

fun step where ― ‹Loop body.\;›
  ‹step a (State χ p) = do {
    b ← bernoulli_pmf p;
    let χ = (if b then χ ∪ {a} else χ - {a});

    if card χ = n then do {
      χ ← subsample χ;
      return_pmf (State χ (p * f))
    } else do {
      return_pmf (State χ p)
    }
   }›

fun run_steps where ― ‹Iterate loop over stream @{term xs}.\;›
  ‹run_steps xs = foldM_pmf step xs initial_state›

fun estimate where
  ‹estimate (State χ p) = card χ / p›

fun run_algo where ― ‹Run algorithm and estimate.\;›
  ‹run_algo xs = map_pmf estimate (run_steps xs)›

definition ‹subsample_size = (THE x. real x = n * f)›

declare subsample.simps [simp del]

lemma subsample_size_eq:
  ‹real subsample_size = n * f›
proof -
  obtain a where a_def:‹real a = real n * f› using f_range(2) by (metis Nats_cases)
  show ?thesis
    unfolding subsample_size_def using a_def
    by (rule theI2[where a=‹a›]) (use a_def in auto)
qed

lemma subsample_size:
  ‹subsample_size < n› ‹2 * subsample_size ≥ n›
proof (goal_cases)
  case 1
  have ‹real subsample_size < real n›
    unfolding subsample_size_eq using f_range(1) n_gt_0 by auto
  thus ?case by simp
next
  case 2
  have ‹real n ≤ 2 * real subsample_size›
    using f_range(1) n_gt_0 unfolding subsample_size_eq by auto
  thus ?case by simp
qed

lemma subsample_finite_nonempty:
  assumes ‹card U = n›
  shows
    ‹{T. T ⊆ U ∧ card T = subsample_size} ≠ {}› (is ‹?C ≠ {}›)
    ‹finite {T. T ⊆ U ∧ card T = subsample_size}›
    ‹subsample U = pmf_of_set {T. T ⊆ U ∧ card T = subsample_size}›
    ‹finite (set_pmf (subsample U))›
proof -
  have fin_U: ‹finite U› using assms subsample_size
    by (meson card_gt_0_iff le0 order_le_less_trans order_less_le_trans)
  have a: ‹card U choose subsample_size > 0›
    using subsample_size assms by (intro zero_less_binomial) auto
  show b:‹subsample U = pmf_of_set ?C›
    using subsample_size_eq unfolding subsample.simps
    by (intro arg_cong[where f=‹pmf_of_set›] Collect_cong) auto
  with assms subsample_size have ‹card ?C > 0›
    using n_subsets[OF fin_U] by simp
  thus ‹?C ≠ {}› ‹finite ?C› using card_gt_0_iff by blast+
  thus ‹finite (set_pmf (subsample U))› unfolding b by auto
qed

lemma int_prod_subsample_eq_prod_int:
  fixes g :: ‹bool ⇒ real›
  assumes ‹card U = n› ‹S ⊆ U› ‹range g ⊆ {0..}›
  shows ‹(∫ω. (∏s∈S. g(s ∈ ω)) ∂subsample U) ≤ (∏s∈S. (∫ω. g ω ∂bernoulli_pmf f))› (is ‹?L ≤ ?R›)
proof -
  define η where ‹η ≡ if g True ≥ g False then Fwd else Rev›

  have fin_U: ‹finite U› using assms subsample_size
    by (meson card_gt_0_iff le0 order_le_less_trans order_less_le_trans)

  note subsample = subsample_finite_nonempty[OF assms(1)]

  note [simp] = integrable_measure_pmf_finite[OF subsample(4)]

  let ?C = ‹{T. T ⊆ U ∧ card T = subsample_size}›

  have subssample_size_le_card_U: ‹subsample_size ≤ card U›
    using subsample_size unfolding assms(1) by simp

  have ‹measure_pmf.neg_assoc (subsample U) (λs ω. (s ∈ ω)) U›
    using subssample_size_le_card_U unfolding subsample
    by (intro n_subsets_distribution_neg_assoc fin_U)

  hence na: ‹measure_pmf.neg_assoc (subsample U) (λs ω. (s ∈ ω)) S›
    using measure_pmf.neg_assoc_subset[OF assms(2)] by auto

  have fin_S: ‹finite S› using assms(2) fin_U finite_subset by auto
  note na_imp_prod_mono = has_int_thatD(2)[OF measure_pmf.neg_assoc_imp_prod_mono[OF fin_S na]]

  have g_borel: ‹g ∈ borel_measurable borel› by (intro borel_measurable_continuous_onI) simp
  have g_mono_aux: ‹g x ≤≥⇘η⇙ g y› if  ‹x ≤ y› for x y
    unfolding η_def using that by simp (smt (verit, best))
  have g_mono: ‹monotone (≤) (≤≥⇘η⇙) g›
    by (intro monotoneI) (auto simp:dir_le_refl intro!:g_mono_aux)

  have a: ‹map_pmf (λω. s ∈ ω) (subsample U) = bernoulli_pmf f› if ‹s ∈ U› for s
  proof -
    have ‹measure (pmf_of_set ?C) {x. s ∈ x} = real subsample_size / card U›
      by (intro n_subsets_prob subssample_size_le_card_U that fin_U)
    also have ‹… = f› unfolding subsample_size_eq  assms(1) using n_gt_0 by auto
    finally have ‹measure (pmf_of_set ?C) {x. s ∈ x} = f› by simp
    thus ?thesis
      unfolding subsample by (intro eq_bernoulli_pmfI) (simp add: pmf_map vimage_def)
  qed

  have ‹?L ≤ (∏s∈S. (∫ω. g (s ∈ ω) ∂subsample U))›
    by (intro na_imp_prod_mono[OF _ g_mono] g_borel assms(3)) auto
  also have ‹… = (∏s∈S. (∫ω. g ω ∂map_pmf (λω. s ∈ ω) (subsample U)))› by simp
  also have ‹… = ?R› using a assms(2) by (intro prod.cong refl) (metis in_mono)
  finally show ?thesis .
qed

schematic_goal step_n_def: ‹step x σ = ?x›
  by (subst state.collapse[symmetric], subst step.simps, rule refl)

interpretation abs: cvm_algo_abstract n f subsample
  rewrites ‹abs.run_steps = run_steps› and ‹abs.estimate = estimate›
proof -
  show abs:‹cvm_algo_abstract n f subsample›
  proof (unfold_locales, goal_cases)
    case 1 thus ?case using subsample_size by auto
  next
    case 2 thus ?case using f_range by auto
  next
    case (3 U x) thus ?case using subsample_finite_nonempty[OF 3(1)] by simp
  next
    case (4 g U S) thus ?case by (intro int_prod_subsample_eq_prod_int) auto
  qed
  have a:‹(λx σ. cvm_algo_abstract.step_1 x σ ⤜ cvm_algo_abstract.step_2 n f subsample) = step›
    unfolding cvm_algo_abstract.step_1_def[OF abs]  cvm_algo_abstract.step_2_def[OF abs] step_n_def
    by (intro ext) (simp add: bind_assoc_pmf Let_def bind_return_pmf Set.remove_def cong:if_cong)
  have c:‹cvm_algo_abstract.initial_state = initial_state›
    unfolding cvm_algo_abstract.initial_state_def[OF abs] initial_state_def by auto
  show ‹cvm_algo_abstract.run_steps n f subsample = run_steps›
    unfolding cvm_algo_abstract.run_steps_def[OF abs] run_steps.simps a c by simp
  show ‹cvm_algo_abstract.estimate = estimate›
    unfolding cvm_algo_abstract.estimate_def[OF abs]
    by (intro ext) (metis estimate.simps state.collapse)
qed

theorem unbiasedness: ‹measure_pmf.expectation (run_algo xs) id = card (set xs)›
  unfolding run_algo.simps integral_map_pmf using abs.unbiasedness by simp

theorem correctness:
  assumes ‹ε ∈ {0<..<1::real}› ‹δ ∈ {0<..<1::real}›
  assumes ‹real n ≥ 12 / ε⇧² * ln (3 * real (length xs) / δ)›
  defines ‹A ≡ real (card (set xs))›
  shows ‹𝒫(R in run_algo xs. ¦R - A¦ > ε * A) ≤ δ›
  using assms(3) unfolding A_def using abs.correctness[OF assms(1,2)] by auto

lemma space_usage:
  ‹AE σ in run_steps xs. card (state_χ σ) < n ∧ finite (state_χ σ)›
proof -
  define ρ where ‹ρ = FinalState xs›
  have ‹card (state_χ σ) < n + (case ρ of FinalState _ ⇒ 0 | IntermState _ _ ⇒ 1)›
    if ‹σ ∈ set_pmf (abs.run_state_pmf ρ)› for σ
    using that
  proof (induction ρ arbitrary:σ rule:run_state_induct)
    case 1
    then show ?case using n_gt_0 by (simp add:initial_state_def)
  next
    case (2 xs x)
    have ‹card (state_χ τ) < n ∧ finite (state_χ τ)›
      if ‹τ ∈ set_pmf (abs.run_state_pmf (FinalState xs))› for τ
      using 2(1) abs.state_χ_finite[where ρ=‹FinalState xs›] that by (simp add:AE_measure_pmf_iff)
    thus ?case
      using 2(2) unfolding abs.step_1_def abs.run_state_pmf.simps Let_def map_pmf_def[symmetric]
      by (force simp:card_insert_if remove_def)
  next
    case (3 xs x)
    define p where ‹p = abs.run_state_pmf (IntermState xs x)›

    have a: ‹abs.run_state_pmf (FinalState (xs@[x])) = p ⤜ abs.step_2›
      by (simp add:p_def abs.run_steps_snoc del:run_steps.simps)

    have b:‹card χ < card (state_χ τ)›
      if ‹card (state_χ τ) = n› ‹χ ∈ set_pmf (subsample (state_χ τ))› ‹τ ∈ set_pmf p› for χ τ
    proof -
      from subsample_finite_nonempty[OF that(1)]
      have ‹card χ = subsample_size› using that unfolding subsample_def by auto
      thus ?thesis using subsample_size(1) that by auto
    qed

    have ‹card (state_χ τ) < n ∨ card (state_χ τ) = n› ‹finite (state_χ τ)›
      if ‹τ ∈ set_pmf p› for τ
      using 3(1) abs.state_χ_finite[where ρ=‹IntermState xs x›] that unfolding p_def
      by (auto simp:AE_measure_pmf_iff less_Suc_eq)
    hence ‹card (state_χ σ) < n›
      using 3(2) unfolding a abs.step_2_def Let_def by (auto intro!:b simp:if_distrib if_distribR)
    thus ?case by simp
  qed
  thus ?thesis using abs.state_χ_finite[where ρ=‹FinalState xs›] unfolding ρ_def
    by (simp add:AE_measure_pmf_iff)
qed

end (* context *)

end (* theory *)