170 |
170 |
171 text {* The Kullback$-$Leibler divergence is also known as relative entropy or |
171 text {* The Kullback$-$Leibler divergence is also known as relative entropy or |
172 Kullback$-$Leibler distance. *} |
172 Kullback$-$Leibler distance. *} |
173 |
173 |
174 definition |
174 definition |
175 "entropy_density b M \<nu> = log b \<circ> real \<circ> RN_deriv M \<nu>" |
175 "entropy_density b M N = log b \<circ> real \<circ> RN_deriv M N" |
176 |
176 |
177 definition |
177 definition |
178 "KL_divergence b M \<nu> = integral\<^isup>L (M\<lparr>measure := \<nu>\<rparr>) (entropy_density b M \<nu>)" |
178 "KL_divergence b M N = integral\<^isup>L N (entropy_density b M N)" |
179 |
179 |
180 lemma (in information_space) measurable_entropy_density: |
180 lemma (in information_space) measurable_entropy_density: |
181 assumes ps: "prob_space (M\<lparr>measure := \<nu>\<rparr>)" |
181 assumes ac: "absolutely_continuous M N" "sets N = events" |
182 assumes ac: "absolutely_continuous \<nu>" |
182 shows "entropy_density b M N \<in> borel_measurable M" |
183 shows "entropy_density b M \<nu> \<in> borel_measurable M" |
183 proof - |
184 proof - |
184 from borel_measurable_RN_deriv[OF ac] b_gt_1 show ?thesis |
185 interpret \<nu>: prob_space "M\<lparr>measure := \<nu>\<rparr>" by fact |
|
186 have "measure_space (M\<lparr>measure := \<nu>\<rparr>)" by fact |
|
187 from RN_deriv[OF this ac] b_gt_1 show ?thesis |
|
188 unfolding entropy_density_def |
185 unfolding entropy_density_def |
189 by (intro measurable_comp) auto |
186 by (intro measurable_comp) auto |
190 qed |
187 qed |
191 |
188 |
|
189 lemma (in sigma_finite_measure) KL_density: |
|
190 fixes f :: "'a \<Rightarrow> real" |
|
191 assumes "1 < b" |
|
192 assumes f: "f \<in> borel_measurable M" "AE x in M. 0 \<le> f x" |
|
193 shows "KL_divergence b M (density M f) = (\<integral>x. f x * log b (f x) \<partial>M)" |
|
194 unfolding KL_divergence_def |
|
195 proof (subst integral_density) |
|
196 show "entropy_density b M (density M (\<lambda>x. ereal (f x))) \<in> borel_measurable M" |
|
197 using f `1 < b` |
|
198 by (auto simp: comp_def entropy_density_def intro!: borel_measurable_log borel_measurable_RN_deriv_density) |
|
199 have "density M (RN_deriv M (density M f)) = density M f" |
|
200 using f by (intro density_RN_deriv_density) auto |
|
201 then have eq: "AE x in M. RN_deriv M (density M f) x = f x" |
|
202 using f |
|
203 by (intro density_unique) |
|
204 (auto intro!: borel_measurable_log borel_measurable_RN_deriv_density simp: RN_deriv_density_nonneg) |
|
205 show "(\<integral>x. f x * entropy_density b M (density M (\<lambda>x. ereal (f x))) x \<partial>M) = (\<integral>x. f x * log b (f x) \<partial>M)" |
|
206 apply (intro integral_cong_AE) |
|
207 using eq |
|
208 apply eventually_elim |
|
209 apply (auto simp: entropy_density_def) |
|
210 done |
|
211 qed fact+ |
|
212 |
|
213 lemma (in sigma_finite_measure) KL_density_density: |
|
214 fixes f g :: "'a \<Rightarrow> real" |
|
215 assumes "1 < b" |
|
216 assumes f: "f \<in> borel_measurable M" "AE x in M. 0 \<le> f x" |
|
217 assumes g: "g \<in> borel_measurable M" "AE x in M. 0 \<le> g x" |
|
218 assumes ac: "AE x in M. f x = 0 \<longrightarrow> g x = 0" |
|
219 shows "KL_divergence b (density M f) (density M g) = (\<integral>x. g x * log b (g x / f x) \<partial>M)" |
|
220 proof - |
|
221 interpret Mf: sigma_finite_measure "density M f" |
|
222 using f by (subst sigma_finite_iff_density_finite) auto |
|
223 have "KL_divergence b (density M f) (density M g) = |
|
224 KL_divergence b (density M f) (density (density M f) (\<lambda>x. g x / f x))" |
|
225 using f g ac by (subst density_density_divide) simp_all |
|
226 also have "\<dots> = (\<integral>x. (g x / f x) * log b (g x / f x) \<partial>density M f)" |
|
227 using f g `1 < b` by (intro Mf.KL_density) (auto simp: AE_density divide_nonneg_nonneg) |
|
228 also have "\<dots> = (\<integral>x. g x * log b (g x / f x) \<partial>M)" |
|
229 using ac f g `1 < b` by (subst integral_density) (auto intro!: integral_cong_AE) |
|
230 finally show ?thesis . |
|
231 qed |
|
232 |
192 lemma (in information_space) KL_gt_0: |
233 lemma (in information_space) KL_gt_0: |
193 assumes ps: "prob_space (M\<lparr>measure := \<nu>\<rparr>)" |
234 fixes D :: "'a \<Rightarrow> real" |
194 assumes ac: "absolutely_continuous \<nu>" |
235 assumes "prob_space (density M D)" |
195 assumes int: "integrable (M\<lparr> measure := \<nu> \<rparr>) (entropy_density b M \<nu>)" |
236 assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x" |
196 assumes A: "A \<in> sets M" "\<nu> A \<noteq> \<mu> A" |
237 assumes int: "integrable M (\<lambda>x. D x * log b (D x))" |
197 shows "0 < KL_divergence b M \<nu>" |
238 assumes A: "density M D \<noteq> M" |
198 proof - |
239 shows "0 < KL_divergence b M (density M D)" |
199 interpret \<nu>: prob_space "M\<lparr>measure := \<nu>\<rparr>" by fact |
240 proof - |
200 have ms: "measure_space (M\<lparr>measure := \<nu>\<rparr>)" by default |
241 interpret N: prob_space "density M D" by fact |
201 have fms: "finite_measure (M\<lparr>measure := \<nu>\<rparr>)" by unfold_locales |
242 |
202 note RN = RN_deriv[OF ms ac] |
243 obtain A where "A \<in> sets M" "emeasure (density M D) A \<noteq> emeasure M A" |
203 |
244 using measure_eqI[of "density M D" M] `density M D \<noteq> M` by auto |
204 from real_RN_deriv[OF fms ac] guess D . note D = this |
245 |
205 with absolutely_continuous_AE[OF ms] ac |
246 let ?D_set = "{x\<in>space M. D x \<noteq> 0}" |
206 have D\<nu>: "AE x in M\<lparr>measure := \<nu>\<rparr>. RN_deriv M \<nu> x = ereal (D x)" |
247 have [simp, intro]: "?D_set \<in> sets M" |
207 by auto |
248 using D by auto |
208 |
|
209 def f \<equiv> "\<lambda>x. if D x = 0 then 1 else 1 / D x" |
|
210 with D have f_borel: "f \<in> borel_measurable M" |
|
211 by (auto intro!: measurable_If) |
|
212 |
|
213 have "KL_divergence b M \<nu> = 1 / ln b * (\<integral> x. ln b * entropy_density b M \<nu> x \<partial>M\<lparr>measure := \<nu>\<rparr>)" |
|
214 unfolding KL_divergence_def using int b_gt_1 |
|
215 by (simp add: integral_cmult) |
|
216 |
|
217 { fix A assume "A \<in> sets M" |
|
218 with RN D have "\<nu>.\<mu> A = (\<integral>\<^isup>+ x. ereal (D x) * indicator A x \<partial>M)" |
|
219 by (auto intro!: positive_integral_cong_AE) } |
|
220 note D_density = this |
|
221 |
|
222 have ln_entropy: "(\<lambda>x. ln b * entropy_density b M \<nu> x) \<in> borel_measurable M" |
|
223 using measurable_entropy_density[OF ps ac] by auto |
|
224 |
|
225 have "integrable (M\<lparr>measure := \<nu>\<rparr>) (\<lambda>x. ln b * entropy_density b M \<nu> x)" |
|
226 using int by auto |
|
227 moreover have "integrable (M\<lparr>measure := \<nu>\<rparr>) (\<lambda>x. ln b * entropy_density b M \<nu> x) \<longleftrightarrow> |
|
228 integrable M (\<lambda>x. D x * (ln b * entropy_density b M \<nu> x))" |
|
229 using D D_density ln_entropy |
|
230 by (intro integral_translated_density) auto |
|
231 ultimately have M_int: "integrable M (\<lambda>x. D x * (ln b * entropy_density b M \<nu> x))" |
|
232 by simp |
|
233 |
249 |
234 have D_neg: "(\<integral>\<^isup>+ x. ereal (- D x) \<partial>M) = 0" |
250 have D_neg: "(\<integral>\<^isup>+ x. ereal (- D x) \<partial>M) = 0" |
235 using D by (subst positive_integral_0_iff_AE) auto |
251 using D by (subst positive_integral_0_iff_AE) auto |
236 |
252 |
237 have "(\<integral>\<^isup>+ x. ereal (D x) \<partial>M) = \<nu> (space M)" |
253 have "(\<integral>\<^isup>+ x. ereal (D x) \<partial>M) = emeasure (density M D) (space M)" |
238 using RN D by (auto intro!: positive_integral_cong_AE) |
254 using D by (simp add: emeasure_density cong: positive_integral_cong) |
239 then have D_pos: "(\<integral>\<^isup>+ x. ereal (D x) \<partial>M) = 1" |
255 then have D_pos: "(\<integral>\<^isup>+ x. ereal (D x) \<partial>M) = 1" |
240 using \<nu>.measure_space_1 by simp |
256 using N.emeasure_space_1 by simp |
241 |
257 |
242 have "integrable M D" |
258 have "integrable M D" "integral\<^isup>L M D = 1" |
243 using D_pos D_neg D by (auto simp: integrable_def) |
259 using D D_pos D_neg unfolding integrable_def lebesgue_integral_def by simp_all |
244 |
260 |
245 have "integral\<^isup>L M D = 1" |
261 have "0 \<le> 1 - measure M ?D_set" |
246 using D_pos D_neg by (auto simp: lebesgue_integral_def) |
|
247 |
|
248 let ?D_set = "{x\<in>space M. D x \<noteq> 0}" |
|
249 have [simp, intro]: "?D_set \<in> sets M" |
|
250 using D by (auto intro: sets_Collect) |
|
251 |
|
252 have "0 \<le> 1 - \<mu>' ?D_set" |
|
253 using prob_le_1 by (auto simp: field_simps) |
262 using prob_le_1 by (auto simp: field_simps) |
254 also have "\<dots> = (\<integral> x. D x - indicator ?D_set x \<partial>M)" |
263 also have "\<dots> = (\<integral> x. D x - indicator ?D_set x \<partial>M)" |
255 using `integrable M D` `integral\<^isup>L M D = 1` |
264 using `integrable M D` `integral\<^isup>L M D = 1` |
256 by (simp add: \<mu>'_def) |
265 by (simp add: emeasure_eq_measure) |
257 also have "\<dots> < (\<integral> x. D x * (ln b * entropy_density b M \<nu> x) \<partial>M)" |
266 also have "\<dots> < (\<integral> x. D x * (ln b * log b (D x)) \<partial>M)" |
258 proof (rule integral_less_AE) |
267 proof (rule integral_less_AE) |
259 show "integrable M (\<lambda>x. D x - indicator ?D_set x)" |
268 show "integrable M (\<lambda>x. D x - indicator ?D_set x)" |
260 using `integrable M D` |
269 using `integrable M D` |
261 by (intro integral_diff integral_indicator) auto |
270 by (intro integral_diff integral_indicator) auto |
262 next |
271 next |
263 show "integrable M (\<lambda>x. D x * (ln b * entropy_density b M \<nu> x))" |
272 from integral_cmult(1)[OF int, of "ln b"] |
264 by fact |
273 show "integrable M (\<lambda>x. D x * (ln b * log b (D x)))" |
|
274 by (simp add: ac_simps) |
265 next |
275 next |
266 show "\<mu> {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<noteq> 0" |
276 show "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<noteq> 0" |
267 proof |
277 proof |
268 assume eq_0: "\<mu> {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} = 0" |
278 assume eq_0: "emeasure M {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} = 0" |
269 then have disj: "AE x. D x = 1 \<or> D x = 0" |
279 then have disj: "AE x in M. D x = 1 \<or> D x = 0" |
270 using D(1) by (auto intro!: AE_I[OF subset_refl] sets_Collect) |
280 using D(1) by (auto intro!: AE_I[OF subset_refl] sets_Collect) |
271 |
281 |
272 have "\<mu> {x\<in>space M. D x = 1} = (\<integral>\<^isup>+ x. indicator {x\<in>space M. D x = 1} x \<partial>M)" |
282 have "emeasure M {x\<in>space M. D x = 1} = (\<integral>\<^isup>+ x. indicator {x\<in>space M. D x = 1} x \<partial>M)" |
273 using D(1) by auto |
283 using D(1) by auto |
274 also have "\<dots> = (\<integral>\<^isup>+ x. ereal (D x) * indicator {x\<in>space M. D x \<noteq> 0} x \<partial>M)" |
284 also have "\<dots> = (\<integral>\<^isup>+ x. ereal (D x) \<partial>M)" |
275 using disj by (auto intro!: positive_integral_cong_AE simp: indicator_def one_ereal_def) |
285 using disj by (auto intro!: positive_integral_cong_AE simp: indicator_def one_ereal_def) |
276 also have "\<dots> = \<nu> {x\<in>space M. D x \<noteq> 0}" |
286 finally have "AE x in M. D x = 1" |
277 using D(1) D_density by auto |
287 using D D_pos by (intro AE_I_eq_1) auto |
278 also have "\<dots> = \<nu> (space M)" |
|
279 using D_density D(1) by (auto intro!: positive_integral_cong simp: indicator_def) |
|
280 finally have "AE x. D x = 1" |
|
281 using D(1) \<nu>.measure_space_1 by (intro AE_I_eq_1) auto |
|
282 then have "(\<integral>\<^isup>+x. indicator A x\<partial>M) = (\<integral>\<^isup>+x. ereal (D x) * indicator A x\<partial>M)" |
288 then have "(\<integral>\<^isup>+x. indicator A x\<partial>M) = (\<integral>\<^isup>+x. ereal (D x) * indicator A x\<partial>M)" |
283 by (intro positive_integral_cong_AE) (auto simp: one_ereal_def[symmetric]) |
289 by (intro positive_integral_cong_AE) (auto simp: one_ereal_def[symmetric]) |
284 also have "\<dots> = \<nu> A" |
290 also have "\<dots> = density M D A" |
285 using `A \<in> sets M` D_density by simp |
291 using `A \<in> sets M` D by (simp add: emeasure_density) |
286 finally show False using `A \<in> sets M` `\<nu> A \<noteq> \<mu> A` by simp |
292 finally show False using `A \<in> sets M` `emeasure (density M D) A \<noteq> emeasure M A` by simp |
287 qed |
293 qed |
288 show "{x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<in> sets M" |
294 show "{x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<in> sets M" |
289 using D(1) by (auto intro: sets_Collect) |
295 using D(1) by (auto intro: sets_Collect_conj) |
290 |
296 |
291 show "AE t. t \<in> {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<longrightarrow> |
297 show "AE t in M. t \<in> {x\<in>space M. D x \<noteq> 1 \<and> D x \<noteq> 0} \<longrightarrow> |
292 D t - indicator ?D_set t \<noteq> D t * (ln b * entropy_density b M \<nu> t)" |
298 D t - indicator ?D_set t \<noteq> D t * (ln b * log b (D t))" |
293 using D(2) |
299 using D(2) |
294 proof (elim AE_mp, safe intro!: AE_I2) |
300 proof (eventually_elim, safe) |
295 fix t assume Dt: "t \<in> space M" "D t \<noteq> 1" "D t \<noteq> 0" |
301 fix t assume Dt: "t \<in> space M" "D t \<noteq> 1" "D t \<noteq> 0" "0 \<le> D t" |
296 and RN: "RN_deriv M \<nu> t = ereal (D t)" |
302 and eq: "D t - indicator ?D_set t = D t * (ln b * log b (D t))" |
297 and eq: "D t - indicator ?D_set t = D t * (ln b * entropy_density b M \<nu> t)" |
|
298 |
303 |
299 have "D t - 1 = D t - indicator ?D_set t" |
304 have "D t - 1 = D t - indicator ?D_set t" |
300 using Dt by simp |
305 using Dt by simp |
301 also note eq |
306 also note eq |
302 also have "D t * (ln b * entropy_density b M \<nu> t) = - D t * ln (1 / D t)" |
307 also have "D t * (ln b * log b (D t)) = - D t * ln (1 / D t)" |
303 using RN b_gt_1 `D t \<noteq> 0` `0 \<le> D t` |
308 using b_gt_1 `D t \<noteq> 0` `0 \<le> D t` |
304 by (simp add: entropy_density_def log_def ln_div less_le) |
309 by (simp add: log_def ln_div less_le) |
305 finally have "ln (1 / D t) = 1 / D t - 1" |
310 finally have "ln (1 / D t) = 1 / D t - 1" |
306 using `D t \<noteq> 0` by (auto simp: field_simps) |
311 using `D t \<noteq> 0` by (auto simp: field_simps) |
307 from ln_eq_minus_one[OF _ this] `D t \<noteq> 0` `0 \<le> D t` `D t \<noteq> 1` |
312 from ln_eq_minus_one[OF _ this] `D t \<noteq> 0` `0 \<le> D t` `D t \<noteq> 1` |
308 show False by auto |
313 show False by auto |
309 qed |
314 qed |
310 |
315 |
311 show "AE t. D t - indicator ?D_set t \<le> D t * (ln b * entropy_density b M \<nu> t)" |
316 show "AE t in M. D t - indicator ?D_set t \<le> D t * (ln b * log b (D t))" |
312 using D(2) |
317 using D(2) AE_space |
313 proof (elim AE_mp, intro AE_I2 impI) |
318 proof eventually_elim |
314 fix t assume "t \<in> space M" and RN: "RN_deriv M \<nu> t = ereal (D t)" |
319 fix t assume "t \<in> space M" "0 \<le> D t" |
315 show "D t - indicator ?D_set t \<le> D t * (ln b * entropy_density b M \<nu> t)" |
320 show "D t - indicator ?D_set t \<le> D t * (ln b * log b (D t))" |
316 proof cases |
321 proof cases |
317 assume asm: "D t \<noteq> 0" |
322 assume asm: "D t \<noteq> 0" |
318 then have "0 < D t" using `0 \<le> D t` by auto |
323 then have "0 < D t" using `0 \<le> D t` by auto |
319 then have "0 < 1 / D t" by auto |
324 then have "0 < 1 / D t" by auto |
320 have "D t - indicator ?D_set t \<le> - D t * (1 / D t - 1)" |
325 have "D t - indicator ?D_set t \<le> - D t * (1 / D t - 1)" |
321 using asm `t \<in> space M` by (simp add: field_simps) |
326 using asm `t \<in> space M` by (simp add: field_simps) |
322 also have "- D t * (1 / D t - 1) \<le> - D t * ln (1 / D t)" |
327 also have "- D t * (1 / D t - 1) \<le> - D t * ln (1 / D t)" |
323 using ln_le_minus_one `0 < 1 / D t` by (intro mult_left_mono_neg) auto |
328 using ln_le_minus_one `0 < 1 / D t` by (intro mult_left_mono_neg) auto |
324 also have "\<dots> = D t * (ln b * entropy_density b M \<nu> t)" |
329 also have "\<dots> = D t * (ln b * log b (D t))" |
325 using `0 < D t` RN b_gt_1 |
330 using `0 < D t` b_gt_1 |
326 by (simp_all add: log_def ln_div entropy_density_def) |
331 by (simp_all add: log_def ln_div) |
327 finally show ?thesis by simp |
332 finally show ?thesis by simp |
328 qed simp |
333 qed simp |
329 qed |
334 qed |
330 qed |
335 qed |
331 also have "\<dots> = (\<integral> x. ln b * entropy_density b M \<nu> x \<partial>M\<lparr>measure := \<nu>\<rparr>)" |
336 also have "\<dots> = (\<integral> x. ln b * (D x * log b (D x)) \<partial>M)" |
332 using D D_density ln_entropy |
337 by (simp add: ac_simps) |
333 by (intro integral_translated_density[symmetric]) auto |
338 also have "\<dots> = ln b * (\<integral> x. D x * log b (D x) \<partial>M)" |
334 also have "\<dots> = ln b * (\<integral> x. entropy_density b M \<nu> x \<partial>M\<lparr>measure := \<nu>\<rparr>)" |
339 using int by (rule integral_cmult) |
335 using int by (rule \<nu>.integral_cmult) |
340 finally show ?thesis |
336 finally show "0 < KL_divergence b M \<nu>" |
341 using b_gt_1 D by (subst KL_density) (auto simp: zero_less_mult_iff) |
337 using b_gt_1 by (auto simp: KL_divergence_def zero_less_mult_iff) |
342 qed |
338 qed |
343 |
339 |
344 lemma (in sigma_finite_measure) KL_same_eq_0: "KL_divergence b M M = 0" |
340 lemma (in sigma_finite_measure) KL_eq_0: |
345 proof - |
341 assumes eq: "\<forall>A\<in>sets M. \<nu> A = measure M A" |
346 have "AE x in M. 1 = RN_deriv M M x" |
342 shows "KL_divergence b M \<nu> = 0" |
|
343 proof - |
|
344 have "AE x. 1 = RN_deriv M \<nu> x" |
|
345 proof (rule RN_deriv_unique) |
347 proof (rule RN_deriv_unique) |
346 show "measure_space (M\<lparr>measure := \<nu>\<rparr>)" |
348 show "(\<lambda>x. 1) \<in> borel_measurable M" "AE x in M. 0 \<le> (1 :: ereal)" by auto |
347 using eq by (intro measure_space_cong) auto |
349 show "density M (\<lambda>x. 1) = M" |
348 show "absolutely_continuous \<nu>" |
350 apply (auto intro!: measure_eqI emeasure_density) |
349 unfolding absolutely_continuous_def using eq by auto |
351 apply (subst emeasure_density) |
350 show "(\<lambda>x. 1) \<in> borel_measurable M" "AE x. 0 \<le> (1 :: ereal)" by auto |
352 apply auto |
351 fix A assume "A \<in> sets M" |
353 done |
352 with eq show "\<nu> A = \<integral>\<^isup>+ x. 1 * indicator A x \<partial>M" by simp |
|
353 qed |
354 qed |
354 then have "AE x. log b (real (RN_deriv M \<nu> x)) = 0" |
355 then have "AE x in M. log b (real (RN_deriv M M x)) = 0" |
355 by (elim AE_mp) simp |
356 by (elim AE_mp) simp |
356 from integral_cong_AE[OF this] |
357 from integral_cong_AE[OF this] |
357 have "integral\<^isup>L M (entropy_density b M \<nu>) = 0" |
358 have "integral\<^isup>L M (entropy_density b M M) = 0" |
358 by (simp add: entropy_density_def comp_def) |
359 by (simp add: entropy_density_def comp_def) |
359 with eq show "KL_divergence b M \<nu> = 0" |
360 then show "KL_divergence b M M = 0" |
360 unfolding KL_divergence_def |
361 unfolding KL_divergence_def |
361 by (subst integral_cong_measure) auto |
362 by auto |
362 qed |
363 qed |
363 |
364 |
364 lemma (in information_space) KL_eq_0_imp: |
365 lemma (in information_space) KL_eq_0_iff_eq: |
365 assumes ps: "prob_space (M\<lparr>measure := \<nu>\<rparr>)" |
366 fixes D :: "'a \<Rightarrow> real" |
366 assumes ac: "absolutely_continuous \<nu>" |
367 assumes "prob_space (density M D)" |
367 assumes int: "integrable (M\<lparr> measure := \<nu> \<rparr>) (entropy_density b M \<nu>)" |
368 assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x" |
368 assumes KL: "KL_divergence b M \<nu> = 0" |
369 assumes int: "integrable M (\<lambda>x. D x * log b (D x))" |
369 shows "\<forall>A\<in>sets M. \<nu> A = \<mu> A" |
370 shows "KL_divergence b M (density M D) = 0 \<longleftrightarrow> density M D = M" |
370 by (metis less_imp_neq KL_gt_0 assms) |
371 using KL_same_eq_0[of b] KL_gt_0[OF assms] |
371 |
372 by (auto simp: less_le) |
372 lemma (in information_space) KL_ge_0: |
373 |
373 assumes ps: "prob_space (M\<lparr>measure := \<nu>\<rparr>)" |
374 lemma (in information_space) KL_eq_0_iff_eq_ac: |
374 assumes ac: "absolutely_continuous \<nu>" |
375 fixes D :: "'a \<Rightarrow> real" |
375 assumes int: "integrable (M\<lparr> measure := \<nu> \<rparr>) (entropy_density b M \<nu>)" |
376 assumes "prob_space N" |
376 shows "0 \<le> KL_divergence b M \<nu>" |
377 assumes ac: "absolutely_continuous M N" "sets N = sets M" |
377 using KL_eq_0 KL_gt_0[OF ps ac int] |
378 assumes int: "integrable N (entropy_density b M N)" |
378 by (cases "\<forall>A\<in>sets M. \<nu> A = measure M A") (auto simp: le_less) |
379 shows "KL_divergence b M N = 0 \<longleftrightarrow> N = M" |
379 |
380 proof - |
380 |
381 interpret N: prob_space N by fact |
381 lemma (in sigma_finite_measure) KL_divergence_vimage: |
382 have "finite_measure N" by unfold_locales |
382 assumes T: "T \<in> measure_preserving M M'" |
383 from real_RN_deriv[OF this ac] guess D . note D = this |
383 and T': "T' \<in> measure_preserving (M'\<lparr> measure := \<nu>' \<rparr>) (M\<lparr> measure := \<nu> \<rparr>)" |
384 |
384 and inv: "\<And>x. x \<in> space M \<Longrightarrow> T' (T x) = x" |
385 have "N = density M (RN_deriv M N)" |
385 and inv': "\<And>x. x \<in> space M' \<Longrightarrow> T (T' x) = x" |
386 using ac by (rule density_RN_deriv[symmetric]) |
386 and \<nu>': "measure_space (M'\<lparr>measure := \<nu>'\<rparr>)" "measure_space.absolutely_continuous M' \<nu>'" |
387 also have "\<dots> = density M D" |
387 and "1 < b" |
388 using borel_measurable_RN_deriv[OF ac] D by (auto intro!: density_cong) |
388 shows "KL_divergence b M' \<nu>' = KL_divergence b M \<nu>" |
389 finally have N: "N = density M D" . |
389 proof - |
390 |
390 interpret \<nu>': measure_space "M'\<lparr>measure := \<nu>'\<rparr>" by fact |
391 from absolutely_continuous_AE[OF ac(2,1) D(2)] D b_gt_1 ac measurable_entropy_density |
391 have M: "measure_space (M\<lparr> measure := \<nu>\<rparr>)" |
392 have "integrable N (\<lambda>x. log b (D x))" |
392 by (rule \<nu>'.measure_space_vimage[OF _ T'], simp) default |
393 by (intro integrable_cong_AE[THEN iffD2, OF _ _ _ int]) |
393 have "sigma_algebra (M'\<lparr> measure := \<nu>'\<rparr>)" by default |
394 (auto simp: N entropy_density_def) |
394 then have saM': "sigma_algebra M'" by simp |
395 with D b_gt_1 have "integrable M (\<lambda>x. D x * log b (D x))" |
395 then interpret M': measure_space M' by (rule measure_space_vimage) fact |
396 by (subst integral_density(2)[symmetric]) (auto simp: N[symmetric] comp_def) |
396 have ac: "absolutely_continuous \<nu>" unfolding absolutely_continuous_def |
397 with `prob_space N` D show ?thesis |
397 proof safe |
398 unfolding N |
398 fix N assume N: "N \<in> sets M" and N_0: "\<mu> N = 0" |
399 by (intro KL_eq_0_iff_eq) auto |
399 then have N': "T' -` N \<inter> space M' \<in> sets M'" |
400 qed |
400 using T' by (auto simp: measurable_def measure_preserving_def) |
401 |
401 have "T -` (T' -` N \<inter> space M') \<inter> space M = N" |
402 lemma (in information_space) KL_nonneg: |
402 using inv T N sets_into_space[OF N] by (auto simp: measurable_def measure_preserving_def) |
403 assumes "prob_space (density M D)" |
403 then have "measure M' (T' -` N \<inter> space M') = 0" |
404 assumes D: "D \<in> borel_measurable M" "AE x in M. 0 \<le> D x" |
404 using measure_preservingD[OF T N'] N_0 by auto |
405 assumes int: "integrable M (\<lambda>x. D x * log b (D x))" |
405 with \<nu>'(2) N' show "\<nu> N = 0" using measure_preservingD[OF T', of N] N |
406 shows "0 \<le> KL_divergence b M (density M D)" |
406 unfolding M'.absolutely_continuous_def measurable_def by auto |
407 using KL_gt_0[OF assms] by (cases "density M D = M") (auto simp: KL_same_eq_0) |
|
408 |
|
409 lemma (in sigma_finite_measure) KL_density_density_nonneg: |
|
410 fixes f g :: "'a \<Rightarrow> real" |
|
411 assumes "1 < b" |
|
412 assumes f: "f \<in> borel_measurable M" "AE x in M. 0 \<le> f x" "prob_space (density M f)" |
|
413 assumes g: "g \<in> borel_measurable M" "AE x in M. 0 \<le> g x" "prob_space (density M g)" |
|
414 assumes ac: "AE x in M. f x = 0 \<longrightarrow> g x = 0" |
|
415 assumes int: "integrable M (\<lambda>x. g x * log b (g x / f x))" |
|
416 shows "0 \<le> KL_divergence b (density M f) (density M g)" |
|
417 proof - |
|
418 interpret Mf: prob_space "density M f" by fact |
|
419 interpret Mf: information_space "density M f" b by default fact |
|
420 have eq: "density (density M f) (\<lambda>x. g x / f x) = density M g" (is "?DD = _") |
|
421 using f g ac by (subst density_density_divide) simp_all |
|
422 |
|
423 have "0 \<le> KL_divergence b (density M f) (density (density M f) (\<lambda>x. g x / f x))" |
|
424 proof (rule Mf.KL_nonneg) |
|
425 show "prob_space ?DD" unfolding eq by fact |
|
426 from f g show "(\<lambda>x. g x / f x) \<in> borel_measurable (density M f)" |
|
427 by auto |
|
428 show "AE x in density M f. 0 \<le> g x / f x" |
|
429 using f g by (auto simp: AE_density divide_nonneg_nonneg) |
|
430 show "integrable (density M f) (\<lambda>x. g x / f x * log b (g x / f x))" |
|
431 using `1 < b` f g ac |
|
432 by (subst integral_density) |
|
433 (auto intro!: integrable_cong_AE[THEN iffD2, OF _ _ _ int] measurable_If) |
407 qed |
434 qed |
408 |
435 also have "\<dots> = KL_divergence b (density M f) (density M g)" |
409 have sa: "sigma_algebra (M\<lparr>measure := \<nu>\<rparr>)" by simp default |
436 using f g ac by (subst density_density_divide) simp_all |
410 have AE: "AE x. RN_deriv M' \<nu>' (T x) = RN_deriv M \<nu> x" |
|
411 by (rule RN_deriv_vimage[OF T T' inv \<nu>']) |
|
412 show ?thesis |
|
413 unfolding KL_divergence_def entropy_density_def comp_def |
|
414 proof (subst \<nu>'.integral_vimage[OF sa T']) |
|
415 show "(\<lambda>x. log b (real (RN_deriv M \<nu> x))) \<in> borel_measurable (M\<lparr>measure := \<nu>\<rparr>)" |
|
416 by (auto intro!: RN_deriv[OF M ac] borel_measurable_log[OF _ `1 < b`]) |
|
417 have "(\<integral> x. log b (real (RN_deriv M' \<nu>' x)) \<partial>M'\<lparr>measure := \<nu>'\<rparr>) = |
|
418 (\<integral> x. log b (real (RN_deriv M' \<nu>' (T (T' x)))) \<partial>M'\<lparr>measure := \<nu>'\<rparr>)" (is "?l = _") |
|
419 using inv' by (auto intro!: \<nu>'.integral_cong) |
|
420 also have "\<dots> = (\<integral> x. log b (real (RN_deriv M \<nu> (T' x))) \<partial>M'\<lparr>measure := \<nu>'\<rparr>)" (is "_ = ?r") |
|
421 using M ac AE |
|
422 by (intro \<nu>'.integral_cong_AE \<nu>'.almost_everywhere_vimage[OF sa T'] absolutely_continuous_AE[OF M]) |
|
423 (auto elim!: AE_mp) |
|
424 finally show "?l = ?r" . |
|
425 qed |
|
426 qed |
|
427 |
|
428 lemma (in sigma_finite_measure) KL_divergence_cong: |
|
429 assumes "measure_space (M\<lparr>measure := \<nu>\<rparr>)" (is "measure_space ?\<nu>") |
|
430 assumes [simp]: "sets N = sets M" "space N = space M" |
|
431 "\<And>A. A \<in> sets M \<Longrightarrow> measure N A = \<mu> A" |
|
432 "\<And>A. A \<in> sets M \<Longrightarrow> \<nu> A = \<nu>' A" |
|
433 shows "KL_divergence b M \<nu> = KL_divergence b N \<nu>'" |
|
434 proof - |
|
435 interpret \<nu>: measure_space ?\<nu> by fact |
|
436 have "KL_divergence b M \<nu> = \<integral>x. log b (real (RN_deriv N \<nu>' x)) \<partial>?\<nu>" |
|
437 by (simp cong: RN_deriv_cong \<nu>.integral_cong add: KL_divergence_def entropy_density_def) |
|
438 also have "\<dots> = KL_divergence b N \<nu>'" |
|
439 by (auto intro!: \<nu>.integral_cong_measure[symmetric] simp: KL_divergence_def entropy_density_def comp_def) |
|
440 finally show ?thesis . |
437 finally show ?thesis . |
441 qed |
|
442 |
|
443 lemma (in finite_measure_space) KL_divergence_eq_finite: |
|
444 assumes v: "finite_measure_space (M\<lparr>measure := \<nu>\<rparr>)" |
|
445 assumes ac: "absolutely_continuous \<nu>" |
|
446 shows "KL_divergence b M \<nu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum") |
|
447 proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v] entropy_density_def) |
|
448 interpret v: finite_measure_space "M\<lparr>measure := \<nu>\<rparr>" by fact |
|
449 have ms: "measure_space (M\<lparr>measure := \<nu>\<rparr>)" by default |
|
450 show "(\<Sum>x \<in> space M. log b (real (RN_deriv M \<nu> x)) * real (\<nu> {x})) = ?sum" |
|
451 using RN_deriv_finite_measure[OF ms ac] |
|
452 by (auto intro!: setsum_cong simp: field_simps) |
|
453 qed |
|
454 |
|
455 lemma (in finite_prob_space) KL_divergence_positive_finite: |
|
456 assumes v: "finite_prob_space (M\<lparr>measure := \<nu>\<rparr>)" |
|
457 assumes ac: "absolutely_continuous \<nu>" |
|
458 and "1 < b" |
|
459 shows "0 \<le> KL_divergence b M \<nu>" |
|
460 proof - |
|
461 interpret information_space M by default fact |
|
462 interpret v: finite_prob_space "M\<lparr>measure := \<nu>\<rparr>" by fact |
|
463 have ps: "prob_space (M\<lparr>measure := \<nu>\<rparr>)" by unfold_locales |
|
464 from KL_ge_0[OF this ac v.integral_finite_singleton(1)] show ?thesis . |
|
465 qed |
438 qed |
466 |
439 |
467 subsection {* Mutual Information *} |
440 subsection {* Mutual Information *} |
468 |
441 |
469 definition (in prob_space) |
442 definition (in prob_space) |
470 "mutual_information b S T X Y = |
443 "mutual_information b S T X Y = |
471 KL_divergence b (S\<lparr>measure := ereal\<circ>distribution X\<rparr> \<Otimes>\<^isub>M T\<lparr>measure := ereal\<circ>distribution Y\<rparr>) |
444 KL_divergence b (distr M S X \<Otimes>\<^isub>M distr M T Y) (distr M (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x)))" |
472 (ereal\<circ>joint_distribution X Y)" |
445 |
473 |
446 lemma (in information_space) mutual_information_indep_vars: |
474 lemma (in information_space) |
|
475 fixes S T X Y |
447 fixes S T X Y |
476 defines "P \<equiv> S\<lparr>measure := ereal\<circ>distribution X\<rparr> \<Otimes>\<^isub>M T\<lparr>measure := ereal\<circ>distribution Y\<rparr>" |
448 defines "P \<equiv> distr M S X \<Otimes>\<^isub>M distr M T Y" |
|
449 defines "Q \<equiv> distr M (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x))" |
477 shows "indep_var S X T Y \<longleftrightarrow> |
450 shows "indep_var S X T Y \<longleftrightarrow> |
478 (random_variable S X \<and> random_variable T Y \<and> |
451 (random_variable S X \<and> random_variable T Y \<and> |
479 measure_space.absolutely_continuous P (ereal\<circ>joint_distribution X Y) \<and> |
452 absolutely_continuous P Q \<and> integrable Q (entropy_density b P Q) \<and> |
480 integrable (P\<lparr>measure := (ereal\<circ>joint_distribution X Y)\<rparr>) |
453 mutual_information b S T X Y = 0)" |
481 (entropy_density b P (ereal\<circ>joint_distribution X Y)) \<and> |
454 unfolding indep_var_distribution_eq |
482 mutual_information b S T X Y = 0)" |
|
483 proof safe |
455 proof safe |
484 assume indep: "indep_var S X T Y" |
456 assume rv: "random_variable S X" "random_variable T Y" |
485 then have "random_variable S X" "random_variable T Y" |
457 |
486 by (blast dest: indep_var_rv1 indep_var_rv2)+ |
458 interpret X: prob_space "distr M S X" |
487 then show "sigma_algebra S" "X \<in> measurable M S" "sigma_algebra T" "Y \<in> measurable M T" |
459 by (rule prob_space_distr) fact |
488 by blast+ |
460 interpret Y: prob_space "distr M T Y" |
489 |
461 by (rule prob_space_distr) fact |
490 interpret X: prob_space "S\<lparr>measure := ereal\<circ>distribution X\<rparr>" |
462 interpret XY: pair_prob_space "distr M S X" "distr M T Y" by default |
491 by (rule distribution_prob_space) fact |
463 interpret P: information_space P b unfolding P_def by default (rule b_gt_1) |
492 interpret Y: prob_space "T\<lparr>measure := ereal\<circ>distribution Y\<rparr>" |
464 |
493 by (rule distribution_prob_space) fact |
465 interpret Q: prob_space Q unfolding Q_def |
494 interpret XY: pair_prob_space "S\<lparr>measure := ereal\<circ>distribution X\<rparr>" "T\<lparr>measure := ereal\<circ>distribution Y\<rparr>" by default |
466 by (rule prob_space_distr) (simp add: comp_def measurable_pair_iff rv) |
495 interpret XY: information_space XY.P b by default (rule b_gt_1) |
467 |
496 |
468 { assume "distr M S X \<Otimes>\<^isub>M distr M T Y = distr M (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x))" |
497 let ?J = "XY.P\<lparr> measure := (ereal\<circ>joint_distribution X Y) \<rparr>" |
469 then have [simp]: "Q = P" unfolding Q_def P_def by simp |
498 { fix A assume "A \<in> sets XY.P" |
470 |
499 then have "ereal (joint_distribution X Y A) = XY.\<mu> A" |
471 show ac: "absolutely_continuous P Q" by (simp add: absolutely_continuous_def) |
500 using indep_var_distributionD[OF indep] |
472 then have ed: "entropy_density b P Q \<in> borel_measurable P" |
501 by (simp add: XY.P.finite_measure_eq) } |
473 by (rule P.measurable_entropy_density) simp |
502 note j_eq = this |
474 |
503 |
475 have "AE x in P. 1 = RN_deriv P Q x" |
504 interpret J: prob_space ?J |
476 proof (rule P.RN_deriv_unique) |
505 using j_eq by (intro XY.prob_space_cong) auto |
477 show "density P (\<lambda>x. 1) = Q" |
506 |
478 unfolding `Q = P` by (intro measure_eqI) (auto simp: emeasure_density) |
507 have ac: "XY.absolutely_continuous (ereal\<circ>joint_distribution X Y)" |
479 qed auto |
508 by (simp add: XY.absolutely_continuous_def j_eq) |
480 then have ae_0: "AE x in P. entropy_density b P Q x = 0" |
509 then show "measure_space.absolutely_continuous P (ereal\<circ>joint_distribution X Y)" |
481 by eventually_elim (auto simp: entropy_density_def) |
510 unfolding P_def . |
482 then have "integrable P (entropy_density b P Q) \<longleftrightarrow> integrable Q (\<lambda>x. 0)" |
511 |
483 using ed unfolding `Q = P` by (intro integrable_cong_AE) auto |
512 have ed: "entropy_density b XY.P (ereal\<circ>joint_distribution X Y) \<in> borel_measurable XY.P" |
484 then show "integrable Q (entropy_density b P Q)" by simp |
513 by (rule XY.measurable_entropy_density) (default | fact)+ |
485 |
514 |
486 show "mutual_information b S T X Y = 0" |
515 have "AE x in XY.P. 1 = RN_deriv XY.P (ereal\<circ>joint_distribution X Y) x" |
487 unfolding mutual_information_def KL_divergence_def P_def[symmetric] Q_def[symmetric] `Q = P` |
516 proof (rule XY.RN_deriv_unique[OF _ ac]) |
488 using ae_0 by (simp cong: integral_cong_AE) } |
517 show "measure_space ?J" by default |
489 |
518 fix A assume "A \<in> sets XY.P" |
490 { assume ac: "absolutely_continuous P Q" |
519 then show "(ereal\<circ>joint_distribution X Y) A = (\<integral>\<^isup>+ x. 1 * indicator A x \<partial>XY.P)" |
491 assume int: "integrable Q (entropy_density b P Q)" |
520 by (simp add: j_eq) |
492 assume I_eq_0: "mutual_information b S T X Y = 0" |
521 qed (insert XY.measurable_const[of 1 borel], auto) |
493 |
522 then have ae_XY: "AE x in XY.P. entropy_density b XY.P (ereal\<circ>joint_distribution X Y) x = 0" |
494 have eq: "Q = P" |
523 by (elim XY.AE_mp) (simp add: entropy_density_def) |
495 proof (rule P.KL_eq_0_iff_eq_ac[THEN iffD1]) |
524 have ae_J: "AE x in ?J. entropy_density b XY.P (ereal\<circ>joint_distribution X Y) x = 0" |
496 show "prob_space Q" by unfold_locales |
525 proof (rule XY.absolutely_continuous_AE) |
497 show "absolutely_continuous P Q" by fact |
526 show "measure_space ?J" by default |
498 show "integrable Q (entropy_density b P Q)" by fact |
527 show "XY.absolutely_continuous (measure ?J)" |
499 show "sets Q = sets P" by (simp add: P_def Q_def sets_pair_measure) |
528 using ac by simp |
500 show "KL_divergence b P Q = 0" |
529 qed (insert ae_XY, simp_all) |
501 using I_eq_0 unfolding mutual_information_def by (simp add: P_def Q_def) |
530 then show "integrable (P\<lparr>measure := (ereal\<circ>joint_distribution X Y)\<rparr>) |
|
531 (entropy_density b P (ereal\<circ>joint_distribution X Y))" |
|
532 unfolding P_def |
|
533 using ed XY.measurable_const[of 0 borel] |
|
534 by (subst J.integrable_cong_AE) auto |
|
535 |
|
536 show "mutual_information b S T X Y = 0" |
|
537 unfolding mutual_information_def KL_divergence_def P_def |
|
538 by (subst J.integral_cong_AE[OF ae_J]) simp |
|
539 next |
|
540 assume "sigma_algebra S" "X \<in> measurable M S" "sigma_algebra T" "Y \<in> measurable M T" |
|
541 then have rvs: "random_variable S X" "random_variable T Y" by blast+ |
|
542 |
|
543 interpret X: prob_space "S\<lparr>measure := ereal\<circ>distribution X\<rparr>" |
|
544 by (rule distribution_prob_space) fact |
|
545 interpret Y: prob_space "T\<lparr>measure := ereal\<circ>distribution Y\<rparr>" |
|
546 by (rule distribution_prob_space) fact |
|
547 interpret XY: pair_prob_space "S\<lparr>measure := ereal\<circ>distribution X\<rparr>" "T\<lparr>measure := ereal\<circ>distribution Y\<rparr>" by default |
|
548 interpret XY: information_space XY.P b by default (rule b_gt_1) |
|
549 |
|
550 let ?J = "XY.P\<lparr> measure := (ereal\<circ>joint_distribution X Y) \<rparr>" |
|
551 interpret J: prob_space ?J |
|
552 using rvs by (intro joint_distribution_prob_space) auto |
|
553 |
|
554 assume ac: "measure_space.absolutely_continuous P (ereal\<circ>joint_distribution X Y)" |
|
555 assume int: "integrable (P\<lparr>measure := (ereal\<circ>joint_distribution X Y)\<rparr>) |
|
556 (entropy_density b P (ereal\<circ>joint_distribution X Y))" |
|
557 assume I_eq_0: "mutual_information b S T X Y = 0" |
|
558 |
|
559 have eq: "\<forall>A\<in>sets XY.P. (ereal \<circ> joint_distribution X Y) A = XY.\<mu> A" |
|
560 proof (rule XY.KL_eq_0_imp) |
|
561 show "prob_space ?J" by unfold_locales |
|
562 show "XY.absolutely_continuous (ereal\<circ>joint_distribution X Y)" |
|
563 using ac by (simp add: P_def) |
|
564 show "integrable ?J (entropy_density b XY.P (ereal\<circ>joint_distribution X Y))" |
|
565 using int by (simp add: P_def) |
|
566 show "KL_divergence b XY.P (ereal\<circ>joint_distribution X Y) = 0" |
|
567 using I_eq_0 unfolding mutual_information_def by (simp add: P_def) |
|
568 qed |
|
569 |
|
570 { fix S X assume "sigma_algebra S" |
|
571 interpret S: sigma_algebra S by fact |
|
572 have "Int_stable \<lparr>space = space M, sets = {X -` A \<inter> space M |A. A \<in> sets S}\<rparr>" |
|
573 proof (safe intro!: Int_stableI) |
|
574 fix A B assume "A \<in> sets S" "B \<in> sets S" |
|
575 then show "\<exists>C. (X -` A \<inter> space M) \<inter> (X -` B \<inter> space M) = (X -` C \<inter> space M) \<and> C \<in> sets S" |
|
576 by (intro exI[of _ "A \<inter> B"]) auto |
|
577 qed } |
|
578 note Int_stable = this |
|
579 |
|
580 show "indep_var S X T Y" unfolding indep_var_eq |
|
581 proof (intro conjI indep_set_sigma_sets Int_stable) |
|
582 show "indep_set {X -` A \<inter> space M |A. A \<in> sets S} {Y -` A \<inter> space M |A. A \<in> sets T}" |
|
583 proof (safe intro!: indep_setI) |
|
584 { fix A assume "A \<in> sets S" then show "X -` A \<inter> space M \<in> sets M" |
|
585 using `X \<in> measurable M S` by (auto intro: measurable_sets) } |
|
586 { fix A assume "A \<in> sets T" then show "Y -` A \<inter> space M \<in> sets M" |
|
587 using `Y \<in> measurable M T` by (auto intro: measurable_sets) } |
|
588 next |
|
589 fix A B assume ab: "A \<in> sets S" "B \<in> sets T" |
|
590 have "ereal (prob ((X -` A \<inter> space M) \<inter> (Y -` B \<inter> space M))) = |
|
591 ereal (joint_distribution X Y (A \<times> B))" |
|
592 unfolding distribution_def |
|
593 by (intro arg_cong[where f="\<lambda>C. ereal (prob C)"]) auto |
|
594 also have "\<dots> = XY.\<mu> (A \<times> B)" |
|
595 using ab eq by (auto simp: XY.finite_measure_eq) |
|
596 also have "\<dots> = ereal (distribution X A) * ereal (distribution Y B)" |
|
597 using ab by (simp add: XY.pair_measure_times) |
|
598 finally show "prob ((X -` A \<inter> space M) \<inter> (Y -` B \<inter> space M)) = |
|
599 prob (X -` A \<inter> space M) * prob (Y -` B \<inter> space M)" |
|
600 unfolding distribution_def by simp |
|
601 qed |
502 qed |
602 qed fact+ |
503 then show "distr M S X \<Otimes>\<^isub>M distr M T Y = distr M (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x))" |
603 qed |
504 unfolding P_def Q_def .. } |
604 |
505 qed |
605 lemma (in information_space) mutual_information_commute_generic: |
|
606 assumes X: "random_variable S X" and Y: "random_variable T Y" |
|
607 assumes ac: "measure_space.absolutely_continuous |
|
608 (S\<lparr>measure := ereal\<circ>distribution X\<rparr> \<Otimes>\<^isub>M T\<lparr>measure := ereal\<circ>distribution Y\<rparr>) (ereal\<circ>joint_distribution X Y)" |
|
609 shows "mutual_information b S T X Y = mutual_information b T S Y X" |
|
610 proof - |
|
611 let ?S = "S\<lparr>measure := ereal\<circ>distribution X\<rparr>" and ?T = "T\<lparr>measure := ereal\<circ>distribution Y\<rparr>" |
|
612 interpret S: prob_space ?S using X by (rule distribution_prob_space) |
|
613 interpret T: prob_space ?T using Y by (rule distribution_prob_space) |
|
614 interpret P: pair_prob_space ?S ?T .. |
|
615 interpret Q: pair_prob_space ?T ?S .. |
|
616 show ?thesis |
|
617 unfolding mutual_information_def |
|
618 proof (intro Q.KL_divergence_vimage[OF Q.measure_preserving_swap _ _ _ _ ac b_gt_1]) |
|
619 show "(\<lambda>(x,y). (y,x)) \<in> measure_preserving |
|
620 (P.P \<lparr> measure := ereal\<circ>joint_distribution X Y\<rparr>) (Q.P \<lparr> measure := ereal\<circ>joint_distribution Y X\<rparr>)" |
|
621 using X Y unfolding measurable_def |
|
622 unfolding measure_preserving_def using P.pair_sigma_algebra_swap_measurable |
|
623 by (auto simp add: space_pair_measure distribution_def intro!: arg_cong[where f=\<mu>']) |
|
624 have "prob_space (P.P\<lparr> measure := ereal\<circ>joint_distribution X Y\<rparr>)" |
|
625 using X Y by (auto intro!: distribution_prob_space random_variable_pairI) |
|
626 then show "measure_space (P.P\<lparr> measure := ereal\<circ>joint_distribution X Y\<rparr>)" |
|
627 unfolding prob_space_def finite_measure_def sigma_finite_measure_def by simp |
|
628 qed auto |
|
629 qed |
|
630 |
|
631 definition (in prob_space) |
|
632 "entropy b s X = mutual_information b s s X X" |
|
633 |
506 |
634 abbreviation (in information_space) |
507 abbreviation (in information_space) |
635 mutual_information_Pow ("\<I>'(_ ; _')") where |
508 mutual_information_Pow ("\<I>'(_ ; _')") where |
636 "\<I>(X ; Y) \<equiv> mutual_information b |
509 "\<I>(X ; Y) \<equiv> mutual_information b (count_space (X`space M)) (count_space (Y`space M)) X Y" |
637 \<lparr> space = X`space M, sets = Pow (X`space M), measure = ereal\<circ>distribution X \<rparr> |
510 |
638 \<lparr> space = Y`space M, sets = Pow (Y`space M), measure = ereal\<circ>distribution Y \<rparr> X Y" |
511 lemma (in information_space) |
639 |
512 fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" |
640 lemma (in prob_space) finite_variables_absolutely_continuous: |
513 assumes "sigma_finite_measure S" "sigma_finite_measure T" |
641 assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y" |
514 assumes Px: "distributed M S X Px" and Py: "distributed M T Y Py" |
642 shows "measure_space.absolutely_continuous |
515 assumes Pxy: "distributed M (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x)) Pxy" |
643 (S\<lparr>measure := ereal\<circ>distribution X\<rparr> \<Otimes>\<^isub>M T\<lparr>measure := ereal\<circ>distribution Y\<rparr>) |
516 defines "f \<equiv> \<lambda>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))" |
644 (ereal\<circ>joint_distribution X Y)" |
517 shows mutual_information_distr: "mutual_information b S T X Y = integral\<^isup>L (S \<Otimes>\<^isub>M T) f" (is "?M = ?R") |
645 proof - |
518 and mutual_information_nonneg: "integrable (S \<Otimes>\<^isub>M T) f \<Longrightarrow> 0 \<le> mutual_information b S T X Y" |
646 interpret X: finite_prob_space "S\<lparr>measure := ereal\<circ>distribution X\<rparr>" |
519 proof - |
647 using X by (rule distribution_finite_prob_space) |
520 have X: "random_variable S X" |
648 interpret Y: finite_prob_space "T\<lparr>measure := ereal\<circ>distribution Y\<rparr>" |
521 using Px by (auto simp: distributed_def) |
649 using Y by (rule distribution_finite_prob_space) |
522 have Y: "random_variable T Y" |
650 interpret XY: pair_finite_prob_space |
523 using Py by (auto simp: distributed_def) |
651 "S\<lparr>measure := ereal\<circ>distribution X\<rparr>" "T\<lparr> measure := ereal\<circ>distribution Y\<rparr>" by default |
524 interpret S: sigma_finite_measure S by fact |
652 interpret P: finite_prob_space "XY.P\<lparr> measure := ereal\<circ>joint_distribution X Y\<rparr>" |
525 interpret T: sigma_finite_measure T by fact |
653 using assms by (auto intro!: joint_distribution_finite_prob_space) |
526 interpret ST: pair_sigma_finite S T .. |
654 note rv = assms[THEN finite_random_variableD] |
527 interpret X: prob_space "distr M S X" using X by (rule prob_space_distr) |
655 show "XY.absolutely_continuous (ereal\<circ>joint_distribution X Y)" |
528 interpret Y: prob_space "distr M T Y" using Y by (rule prob_space_distr) |
656 proof (rule XY.absolutely_continuousI) |
529 interpret XY: pair_prob_space "distr M S X" "distr M T Y" .. |
657 show "finite_measure_space (XY.P\<lparr> measure := ereal\<circ>joint_distribution X Y\<rparr>)" by unfold_locales |
530 let ?P = "S \<Otimes>\<^isub>M T" |
658 fix x assume "x \<in> space XY.P" and "XY.\<mu> {x} = 0" |
531 let ?D = "distr M ?P (\<lambda>x. (X x, Y x))" |
659 then obtain a b where "x = (a, b)" |
532 |
660 and "distribution X {a} = 0 \<or> distribution Y {b} = 0" |
533 { fix A assume "A \<in> sets S" |
661 by (cases x) (auto simp: space_pair_measure) |
534 with X Y have "emeasure (distr M S X) A = emeasure ?D (A \<times> space T)" |
662 with finite_distribution_order(5,6)[OF X Y] |
535 by (auto simp: emeasure_distr measurable_Pair measurable_space |
663 show "(ereal \<circ> joint_distribution X Y) {x} = 0" by auto |
536 intro!: arg_cong[where f="emeasure M"]) } |
|
537 note marginal_eq1 = this |
|
538 { fix A assume "A \<in> sets T" |
|
539 with X Y have "emeasure (distr M T Y) A = emeasure ?D (space S \<times> A)" |
|
540 by (auto simp: emeasure_distr measurable_Pair measurable_space |
|
541 intro!: arg_cong[where f="emeasure M"]) } |
|
542 note marginal_eq2 = this |
|
543 |
|
544 have eq: "(\<lambda>x. ereal (Px (fst x) * Py (snd x))) = (\<lambda>(x, y). ereal (Px x) * ereal (Py y))" |
|
545 by auto |
|
546 |
|
547 have distr_eq: "distr M S X \<Otimes>\<^isub>M distr M T Y = density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))" |
|
548 unfolding Px(1)[THEN distributed_distr_eq_density] Py(1)[THEN distributed_distr_eq_density] eq |
|
549 proof (subst pair_measure_density) |
|
550 show "(\<lambda>x. ereal (Px x)) \<in> borel_measurable S" "(\<lambda>y. ereal (Py y)) \<in> borel_measurable T" |
|
551 "AE x in S. 0 \<le> ereal (Px x)" "AE y in T. 0 \<le> ereal (Py y)" |
|
552 using Px Py by (auto simp: distributed_def) |
|
553 show "sigma_finite_measure (density S Px)" unfolding Px(1)[THEN distributed_distr_eq_density, symmetric] .. |
|
554 show "sigma_finite_measure (density T Py)" unfolding Py(1)[THEN distributed_distr_eq_density, symmetric] .. |
|
555 qed (fact | simp)+ |
|
556 |
|
557 have M: "?M = KL_divergence b (density ?P (\<lambda>x. ereal (Px (fst x) * Py (snd x)))) (density ?P (\<lambda>x. ereal (Pxy x)))" |
|
558 unfolding mutual_information_def distr_eq Pxy(1)[THEN distributed_distr_eq_density] .. |
|
559 |
|
560 from Px Py have f: "(\<lambda>x. Px (fst x) * Py (snd x)) \<in> borel_measurable ?P" |
|
561 by (intro borel_measurable_times) (auto intro: distributed_real_measurable measurable_fst'' measurable_snd'') |
|
562 have PxPy_nonneg: "AE x in ?P. 0 \<le> Px (fst x) * Py (snd x)" |
|
563 proof (rule ST.AE_pair_measure) |
|
564 show "{x \<in> space ?P. 0 \<le> Px (fst x) * Py (snd x)} \<in> sets ?P" |
|
565 using f by auto |
|
566 show "AE x in S. AE y in T. 0 \<le> Px (fst (x, y)) * Py (snd (x, y))" |
|
567 using Px Py by (auto simp: zero_le_mult_iff dest!: distributed_real_AE) |
664 qed |
568 qed |
|
569 |
|
570 have "(AE x in ?P. Px (fst x) = 0 \<longrightarrow> Pxy x = 0)" |
|
571 by (rule subdensity_real[OF measurable_fst Pxy Px]) auto |
|
572 moreover |
|
573 have "(AE x in ?P. Py (snd x) = 0 \<longrightarrow> Pxy x = 0)" |
|
574 by (rule subdensity_real[OF measurable_snd Pxy Py]) auto |
|
575 ultimately have ac: "AE x in ?P. Px (fst x) * Py (snd x) = 0 \<longrightarrow> Pxy x = 0" |
|
576 by eventually_elim auto |
|
577 |
|
578 show "?M = ?R" |
|
579 unfolding M f_def |
|
580 using b_gt_1 f PxPy_nonneg Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] ac |
|
581 by (rule ST.KL_density_density) |
|
582 |
|
583 assume int: "integrable (S \<Otimes>\<^isub>M T) f" |
|
584 show "0 \<le> ?M" unfolding M |
|
585 proof (rule ST.KL_density_density_nonneg |
|
586 [OF b_gt_1 f PxPy_nonneg _ Pxy[THEN distributed_real_measurable] Pxy[THEN distributed_real_AE] _ ac int[unfolded f_def]]) |
|
587 show "prob_space (density (S \<Otimes>\<^isub>M T) (\<lambda>x. ereal (Pxy x))) " |
|
588 unfolding distributed_distr_eq_density[OF Pxy, symmetric] |
|
589 using distributed_measurable[OF Pxy] by (rule prob_space_distr) |
|
590 show "prob_space (density (S \<Otimes>\<^isub>M T) (\<lambda>x. ereal (Px (fst x) * Py (snd x))))" |
|
591 unfolding distr_eq[symmetric] by unfold_locales |
|
592 qed |
665 qed |
593 qed |
666 |
594 |
667 lemma (in information_space) |
595 lemma (in information_space) |
668 assumes MX: "finite_random_variable MX X" |
596 fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" |
669 assumes MY: "finite_random_variable MY Y" |
597 assumes "sigma_finite_measure S" "sigma_finite_measure T" |
670 shows mutual_information_generic_eq: |
598 assumes Px: "distributed M S X Px" and Py: "distributed M T Y Py" |
671 "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY. |
599 assumes Pxy: "distributed M (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x)) Pxy" |
672 joint_distribution X Y {(x,y)} * |
600 assumes ae: "AE x in S. AE y in T. Pxy (x, y) = Px x * Py y" |
673 log b (joint_distribution X Y {(x,y)} / |
601 shows mutual_information_eq_0: "mutual_information b S T X Y = 0" |
674 (distribution X {x} * distribution Y {y})))" |
602 proof - |
675 (is ?sum) |
603 interpret S: sigma_finite_measure S by fact |
676 and mutual_information_positive_generic: |
604 interpret T: sigma_finite_measure T by fact |
677 "0 \<le> mutual_information b MX MY X Y" (is ?positive) |
605 interpret ST: pair_sigma_finite S T .. |
678 proof - |
606 |
679 interpret X: finite_prob_space "MX\<lparr>measure := ereal\<circ>distribution X\<rparr>" |
607 have "AE x in S \<Otimes>\<^isub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0" |
680 using MX by (rule distribution_finite_prob_space) |
608 by (rule subdensity_real[OF measurable_fst Pxy Px]) auto |
681 interpret Y: finite_prob_space "MY\<lparr>measure := ereal\<circ>distribution Y\<rparr>" |
609 moreover |
682 using MY by (rule distribution_finite_prob_space) |
610 have "AE x in S \<Otimes>\<^isub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0" |
683 interpret XY: pair_finite_prob_space "MX\<lparr>measure := ereal\<circ>distribution X\<rparr>" "MY\<lparr>measure := ereal\<circ>distribution Y\<rparr>" by default |
611 by (rule subdensity_real[OF measurable_snd Pxy Py]) auto |
684 interpret P: finite_prob_space "XY.P\<lparr>measure := ereal\<circ>joint_distribution X Y\<rparr>" |
612 moreover |
685 using assms by (auto intro!: joint_distribution_finite_prob_space) |
613 have "AE x in S \<Otimes>\<^isub>M T. Pxy x = Px (fst x) * Py (snd x)" |
686 |
614 using distributed_real_measurable[OF Px] distributed_real_measurable[OF Py] distributed_real_measurable[OF Pxy] |
687 have P_ms: "finite_measure_space (XY.P\<lparr>measure := ereal\<circ>joint_distribution X Y\<rparr>)" by unfold_locales |
615 by (intro ST.AE_pair_measure) (auto simp: ae intro!: measurable_snd'' measurable_fst'') |
688 have P_ps: "finite_prob_space (XY.P\<lparr>measure := ereal\<circ>joint_distribution X Y\<rparr>)" by unfold_locales |
616 ultimately have "AE x in S \<Otimes>\<^isub>M T. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) = 0" |
689 |
617 by eventually_elim simp |
690 show ?sum |
618 then have "(\<integral>x. Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) \<partial>(S \<Otimes>\<^isub>M T)) = (\<integral>x. 0 \<partial>(S \<Otimes>\<^isub>M T))" |
691 unfolding Let_def mutual_information_def |
619 by (rule integral_cong_AE) |
692 by (subst XY.KL_divergence_eq_finite[OF P_ms finite_variables_absolutely_continuous[OF MX MY]]) |
620 then show ?thesis |
693 (auto simp add: space_pair_measure setsum_cartesian_product') |
621 by (subst mutual_information_distr[OF assms(1-5)]) simp |
694 |
622 qed |
695 show ?positive |
623 |
696 using XY.KL_divergence_positive_finite[OF P_ps finite_variables_absolutely_continuous[OF MX MY] b_gt_1] |
624 lemma (in information_space) mutual_information_simple_distributed: |
697 unfolding mutual_information_def . |
625 assumes X: "simple_distributed M X Px" and Y: "simple_distributed M Y Py" |
698 qed |
626 assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy" |
699 |
627 shows "\<I>(X ; Y) = (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x))`space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))" |
700 lemma (in information_space) mutual_information_commute: |
628 proof (subst mutual_information_distr[OF _ _ simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]]) |
701 assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y" |
629 note fin = simple_distributed_joint_finite[OF XY, simp] |
702 shows "mutual_information b S T X Y = mutual_information b T S Y X" |
630 show "sigma_finite_measure (count_space (X ` space M))" |
703 unfolding mutual_information_generic_eq[OF X Y] mutual_information_generic_eq[OF Y X] |
631 by (simp add: sigma_finite_measure_count_space_finite) |
704 unfolding joint_distribution_commute_singleton[of X Y] |
632 show "sigma_finite_measure (count_space (Y ` space M))" |
705 by (auto simp add: ac_simps intro!: setsum_reindex_cong[OF swap_inj_on]) |
633 by (simp add: sigma_finite_measure_count_space_finite) |
706 |
634 let ?Pxy = "\<lambda>x. (if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x else 0)" |
707 lemma (in information_space) mutual_information_commute_simple: |
635 let ?f = "\<lambda>x. ?Pxy x * log b (?Pxy x / (Px (fst x) * Py (snd x)))" |
708 assumes X: "simple_function M X" and Y: "simple_function M Y" |
636 have "\<And>x. ?f x = (if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x))) else 0)" |
709 shows "\<I>(X;Y) = \<I>(Y;X)" |
637 by auto |
710 by (intro mutual_information_commute X Y simple_function_imp_finite_random_variable) |
638 with fin show "(\<integral> x. ?f x \<partial>(count_space (X ` space M) \<Otimes>\<^isub>M count_space (Y ` space M))) = |
711 |
639 (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y)))" |
712 lemma (in information_space) mutual_information_eq: |
640 by (auto simp add: pair_measure_count_space lebesgue_integral_count_space_finite setsum_cases split_beta' |
713 assumes "simple_function M X" "simple_function M Y" |
641 intro!: setsum_cong) |
714 shows "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M. |
642 qed |
715 distribution (\<lambda>x. (X x, Y x)) {(x,y)} * log b (distribution (\<lambda>x. (X x, Y x)) {(x,y)} / |
643 |
716 (distribution X {x} * distribution Y {y})))" |
644 lemma (in information_space) |
717 using assms by (simp add: mutual_information_generic_eq) |
645 fixes Pxy :: "'b \<times> 'c \<Rightarrow> real" and Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" |
718 |
646 assumes Px: "simple_distributed M X Px" and Py: "simple_distributed M Y Py" |
719 lemma (in information_space) mutual_information_generic_cong: |
647 assumes Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy" |
720 assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x" |
648 assumes ae: "\<forall>x\<in>space M. Pxy (X x, Y x) = Px (X x) * Py (Y x)" |
721 assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x" |
649 shows mutual_information_eq_0_simple: "\<I>(X ; Y) = 0" |
722 shows "mutual_information b MX MY X Y = mutual_information b MX MY X' Y'" |
650 proof (subst mutual_information_simple_distributed[OF Px Py Pxy]) |
723 unfolding mutual_information_def using X Y |
651 have "(\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) = |
724 by (simp cong: distribution_cong) |
652 (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. 0)" |
725 |
653 by (intro setsum_cong) (auto simp: ae) |
726 lemma (in information_space) mutual_information_cong: |
654 then show "(\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. |
727 assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x" |
655 Pxy (x, y) * log b (Pxy (x, y) / (Px x * Py y))) = 0" by simp |
728 assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x" |
656 qed |
729 shows "\<I>(X; Y) = \<I>(X'; Y')" |
|
730 unfolding mutual_information_def using X Y |
|
731 by (simp cong: distribution_cong image_cong) |
|
732 |
|
733 lemma (in information_space) mutual_information_positive: |
|
734 assumes "simple_function M X" "simple_function M Y" |
|
735 shows "0 \<le> \<I>(X;Y)" |
|
736 using assms by (simp add: mutual_information_positive_generic) |
|
737 |
657 |
738 subsection {* Entropy *} |
658 subsection {* Entropy *} |
|
659 |
|
660 definition (in prob_space) entropy :: "real \<Rightarrow> 'b measure \<Rightarrow> ('a \<Rightarrow> 'b) \<Rightarrow> real" where |
|
661 "entropy b S X = - KL_divergence b S (distr M S X)" |
739 |
662 |
740 abbreviation (in information_space) |
663 abbreviation (in information_space) |
741 entropy_Pow ("\<H>'(_')") where |
664 entropy_Pow ("\<H>'(_')") where |
742 "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M), measure = ereal\<circ>distribution X \<rparr> X" |
665 "\<H>(X) \<equiv> entropy b (count_space (X`space M)) X" |
743 |
666 |
744 lemma (in information_space) entropy_generic_eq: |
667 lemma (in information_space) entropy_distr: |
745 fixes X :: "'a \<Rightarrow> 'c" |
668 fixes X :: "'a \<Rightarrow> 'b" |
746 assumes MX: "finite_random_variable MX X" |
669 assumes "sigma_finite_measure MX" and X: "distributed M MX X f" |
747 shows "entropy b MX X = -(\<Sum> x \<in> space MX. distribution X {x} * log b (distribution X {x}))" |
670 shows "entropy b MX X = - (\<integral>x. f x * log b (f x) \<partial>MX)" |
748 proof - |
671 proof - |
749 interpret MX: finite_prob_space "MX\<lparr>measure := ereal\<circ>distribution X\<rparr>" |
672 interpret MX: sigma_finite_measure MX by fact |
750 using MX by (rule distribution_finite_prob_space) |
673 from X show ?thesis |
751 let ?X = "\<lambda>x. distribution X {x}" |
674 unfolding entropy_def X[THEN distributed_distr_eq_density] |
752 let ?XX = "\<lambda>x y. joint_distribution X X {(x, y)}" |
675 by (subst MX.KL_density[OF b_gt_1]) (simp_all add: distributed_real_AE distributed_real_measurable) |
753 |
676 qed |
754 { fix x y :: 'c |
677 |
755 { assume "x \<noteq> y" |
678 lemma (in information_space) entropy_uniform: |
756 then have "(\<lambda>x. (X x, X x)) -` {(x,y)} \<inter> space M = {}" by auto |
679 assumes "sigma_finite_measure MX" |
757 then have "joint_distribution X X {(x, y)} = 0" by (simp add: distribution_def) } |
680 assumes A: "A \<in> sets MX" "emeasure MX A \<noteq> 0" "emeasure MX A \<noteq> \<infinity>" |
758 then have "?XX x y * log b (?XX x y / (?X x * ?X y)) = |
681 assumes X: "distributed M MX X (\<lambda>x. 1 / measure MX A * indicator A x)" |
759 (if x = y then - ?X y * log b (?X y) else 0)" |
682 shows "entropy b MX X = log b (measure MX A)" |
760 by (auto simp: log_simps zero_less_mult_iff) } |
683 proof (subst entropy_distr[OF _ X]) |
761 note remove_XX = this |
684 let ?f = "\<lambda>x. 1 / measure MX A * indicator A x" |
762 |
685 have "- (\<integral>x. ?f x * log b (?f x) \<partial>MX) = |
763 show ?thesis |
686 - (\<integral>x. (log b (1 / measure MX A) / measure MX A) * indicator A x \<partial>MX)" |
764 unfolding entropy_def mutual_information_generic_eq[OF MX MX] |
687 by (auto intro!: integral_cong simp: indicator_def) |
765 unfolding setsum_cartesian_product[symmetric] setsum_negf[symmetric] remove_XX |
688 also have "\<dots> = - log b (inverse (measure MX A))" |
766 using MX.finite_space by (auto simp: setsum_cases) |
689 using A by (subst integral_cmult(2)) |
767 qed |
690 (simp_all add: measure_def real_of_ereal_eq_0 integral_cmult inverse_eq_divide) |
768 |
691 also have "\<dots> = log b (measure MX A)" |
769 lemma (in information_space) entropy_eq: |
692 using b_gt_1 A by (subst log_inverse) (auto simp add: measure_def less_le real_of_ereal_eq_0 |
770 assumes "simple_function M X" |
693 emeasure_nonneg real_of_ereal_pos) |
771 shows "\<H>(X) = -(\<Sum> x \<in> X ` space M. distribution X {x} * log b (distribution X {x}))" |
694 finally show "- (\<integral>x. ?f x * log b (?f x) \<partial>MX) = log b (measure MX A)" by simp |
772 using assms by (simp add: entropy_generic_eq) |
695 qed fact+ |
773 |
696 |
774 lemma (in information_space) entropy_positive: |
697 lemma (in information_space) entropy_simple_distributed: |
775 "simple_function M X \<Longrightarrow> 0 \<le> \<H>(X)" |
698 fixes X :: "'a \<Rightarrow> 'b" |
776 unfolding entropy_def by (simp add: mutual_information_positive) |
699 assumes X: "simple_distributed M X f" |
777 |
700 shows "\<H>(X) = - (\<Sum>x\<in>X`space M. f x * log b (f x))" |
778 lemma (in information_space) entropy_certainty_eq_0: |
701 proof (subst entropy_distr[OF _ simple_distributed[OF X]]) |
779 assumes X: "simple_function M X" and "x \<in> X ` space M" and "distribution X {x} = 1" |
702 show "sigma_finite_measure (count_space (X ` space M))" |
780 shows "\<H>(X) = 0" |
703 using X by (simp add: sigma_finite_measure_count_space_finite simple_distributed_def) |
781 proof - |
704 show "- (\<integral>x. f x * log b (f x) \<partial>(count_space (X`space M))) = - (\<Sum>x\<in>X ` space M. f x * log b (f x))" |
782 let ?X = "\<lparr> space = X ` space M, sets = Pow (X ` space M), measure = ereal\<circ>distribution X\<rparr>" |
705 using X by (auto simp add: lebesgue_integral_count_space_finite) |
783 note simple_function_imp_finite_random_variable[OF `simple_function M X`] |
|
784 from distribution_finite_prob_space[OF this, of "\<lparr> measure = ereal\<circ>distribution X \<rparr>"] |
|
785 interpret X: finite_prob_space ?X by simp |
|
786 have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}" |
|
787 using X.measure_compl[of "{x}"] assms by auto |
|
788 also have "\<dots> = 0" using X.prob_space assms by auto |
|
789 finally have X0: "distribution X (X ` space M - {x}) = 0" by auto |
|
790 { fix y assume *: "y \<in> X ` space M" |
|
791 { assume asm: "y \<noteq> x" |
|
792 with * have "{y} \<subseteq> X ` space M - {x}" by auto |
|
793 from X.measure_mono[OF this] X0 asm * |
|
794 have "distribution X {y} = 0" by (auto intro: antisym) } |
|
795 then have "distribution X {y} = (if x = y then 1 else 0)" |
|
796 using assms by auto } |
|
797 note fi = this |
|
798 have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp |
|
799 show ?thesis unfolding entropy_eq[OF `simple_function M X`] by (auto simp: y fi) |
|
800 qed |
706 qed |
801 |
707 |
802 lemma (in information_space) entropy_le_card_not_0: |
708 lemma (in information_space) entropy_le_card_not_0: |
803 assumes X: "simple_function M X" |
709 assumes X: "simple_distributed M X f" |
804 shows "\<H>(X) \<le> log b (card (X ` space M \<inter> {x. distribution X {x} \<noteq> 0}))" |
710 shows "\<H>(X) \<le> log b (card (X ` space M \<inter> {x. f x \<noteq> 0}))" |
805 proof - |
711 proof - |
806 let ?p = "\<lambda>x. distribution X {x}" |
712 have "\<H>(X) = (\<Sum>x\<in>X`space M. f x * log b (1 / f x))" |
807 have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))" |
713 unfolding entropy_simple_distributed[OF X] setsum_negf[symmetric] |
808 unfolding entropy_eq[OF X] setsum_negf[symmetric] |
714 using X by (auto dest: simple_distributed_nonneg intro!: setsum_cong simp: log_simps less_le) |
809 by (auto intro!: setsum_cong simp: log_simps) |
715 also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. f x * (1 / f x))" |
810 also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))" |
716 using not_empty b_gt_1 `simple_distributed M X f` |
811 using not_empty b_gt_1 `simple_function M X` sum_over_space_real_distribution[OF X] |
717 by (intro log_setsum') (auto simp: simple_distributed_nonneg simple_distributed_setsum_space) |
812 by (intro log_setsum') (auto simp: simple_function_def) |
718 also have "\<dots> = log b (\<Sum>x\<in>X`space M. if f x \<noteq> 0 then 1 else 0)" |
813 also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?p x \<noteq> 0 then 1 else 0)" |
|
814 by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) auto |
719 by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) auto |
815 finally show ?thesis |
720 finally show ?thesis |
816 using `simple_function M X` by (auto simp: setsum_cases real_eq_of_nat simple_function_def) |
721 using `simple_distributed M X f` by (auto simp: setsum_cases real_eq_of_nat) |
817 qed |
|
818 |
|
819 lemma (in prob_space) measure'_translate: |
|
820 assumes X: "random_variable S X" and A: "A \<in> sets S" |
|
821 shows "finite_measure.\<mu>' (S\<lparr> measure := ereal\<circ>distribution X \<rparr>) A = distribution X A" |
|
822 proof - |
|
823 interpret S: prob_space "S\<lparr> measure := ereal\<circ>distribution X \<rparr>" |
|
824 using distribution_prob_space[OF X] . |
|
825 from A show "S.\<mu>' A = distribution X A" |
|
826 unfolding S.\<mu>'_def by (simp add: distribution_def [abs_def] \<mu>'_def) |
|
827 qed |
|
828 |
|
829 lemma (in information_space) entropy_uniform_max: |
|
830 assumes X: "simple_function M X" |
|
831 assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}" |
|
832 shows "\<H>(X) = log b (real (card (X ` space M)))" |
|
833 proof - |
|
834 let ?X = "\<lparr> space = X ` space M, sets = Pow (X ` space M), measure = undefined\<rparr>\<lparr> measure := ereal\<circ>distribution X\<rparr>" |
|
835 note frv = simple_function_imp_finite_random_variable[OF X] |
|
836 from distribution_finite_prob_space[OF this, of "\<lparr> measure = ereal\<circ>distribution X \<rparr>"] |
|
837 interpret X: finite_prob_space ?X by simp |
|
838 note rv = finite_random_variableD[OF frv] |
|
839 have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff |
|
840 using `simple_function M X` not_empty by (auto simp: simple_function_def) |
|
841 { fix x assume "x \<in> space ?X" |
|
842 moreover then have "X.\<mu>' {x} = 1 / card (space ?X)" |
|
843 proof (rule X.uniform_prob) |
|
844 fix x y assume "x \<in> space ?X" "y \<in> space ?X" |
|
845 with assms(2)[of x y] show "X.\<mu>' {x} = X.\<mu>' {y}" |
|
846 by (subst (1 2) measure'_translate[OF rv]) auto |
|
847 qed |
|
848 ultimately have "distribution X {x} = 1 / card (space ?X)" |
|
849 by (subst (asm) measure'_translate[OF rv]) auto } |
|
850 thus ?thesis |
|
851 using not_empty X.finite_space b_gt_1 card_gt0 |
|
852 by (simp add: entropy_eq[OF `simple_function M X`] real_eq_of_nat[symmetric] log_simps) |
|
853 qed |
722 qed |
854 |
723 |
855 lemma (in information_space) entropy_le_card: |
724 lemma (in information_space) entropy_le_card: |
856 assumes "simple_function M X" |
725 assumes "simple_distributed M X f" |
857 shows "\<H>(X) \<le> log b (real (card (X ` space M)))" |
726 shows "\<H>(X) \<le> log b (real (card (X ` space M)))" |
858 proof cases |
727 proof cases |
859 assume "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} = {}" |
728 assume "X ` space M \<inter> {x. f x \<noteq> 0} = {}" |
860 then have "\<And>x. x\<in>X`space M \<Longrightarrow> distribution X {x} = 0" by auto |
729 then have "\<And>x. x\<in>X`space M \<Longrightarrow> f x = 0" by auto |
861 moreover |
730 moreover |
862 have "0 < card (X`space M)" |
731 have "0 < card (X`space M)" |
863 using `simple_function M X` not_empty |
732 using `simple_distributed M X f` not_empty by (auto simp: card_gt_0_iff) |
864 by (auto simp: card_gt_0_iff simple_function_def) |
|
865 then have "log b 1 \<le> log b (real (card (X`space M)))" |
733 then have "log b 1 \<le> log b (real (card (X`space M)))" |
866 using b_gt_1 by (intro log_le) auto |
734 using b_gt_1 by (intro log_le) auto |
867 ultimately show ?thesis using assms by (simp add: entropy_eq) |
735 ultimately show ?thesis using assms by (simp add: entropy_simple_distributed) |
868 next |
736 next |
869 assume False: "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} \<noteq> {}" |
737 assume False: "X ` space M \<inter> {x. f x \<noteq> 0} \<noteq> {}" |
870 have "card (X ` space M \<inter> {x. distribution X {x} \<noteq> 0}) \<le> card (X ` space M)" |
738 have "card (X ` space M \<inter> {x. f x \<noteq> 0}) \<le> card (X ` space M)" |
871 (is "?A \<le> ?B") using assms not_empty by (auto intro!: card_mono simp: simple_function_def) |
739 (is "?A \<le> ?B") using assms not_empty |
|
740 by (auto intro!: card_mono simp: simple_function_def simple_distributed_def) |
872 note entropy_le_card_not_0[OF assms] |
741 note entropy_le_card_not_0[OF assms] |
873 also have "log b (real ?A) \<le> log b (real ?B)" |
742 also have "log b (real ?A) \<le> log b (real ?B)" |
874 using b_gt_1 False not_empty `?A \<le> ?B` assms |
743 using b_gt_1 False not_empty `?A \<le> ?B` assms |
875 by (auto intro!: log_le simp: card_gt_0_iff simp: simple_function_def) |
744 by (auto intro!: log_le simp: card_gt_0_iff simp: simple_distributed_def) |
876 finally show ?thesis . |
745 finally show ?thesis . |
877 qed |
|
878 |
|
879 lemma (in information_space) entropy_commute: |
|
880 assumes "simple_function M X" "simple_function M Y" |
|
881 shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(\<lambda>x. (Y x, X x))" |
|
882 proof - |
|
883 have sf: "simple_function M (\<lambda>x. (X x, Y x))" "simple_function M (\<lambda>x. (Y x, X x))" |
|
884 using assms by (auto intro: simple_function_Pair) |
|
885 have *: "(\<lambda>x. (Y x, X x))`space M = (\<lambda>(a,b). (b,a))`(\<lambda>x. (X x, Y x))`space M" |
|
886 by auto |
|
887 have inj: "\<And>X. inj_on (\<lambda>(a,b). (b,a)) X" |
|
888 by (auto intro!: inj_onI) |
|
889 show ?thesis |
|
890 unfolding sf[THEN entropy_eq] unfolding * setsum_reindex[OF inj] |
|
891 by (simp add: joint_distribution_commute[of Y X] split_beta) |
|
892 qed |
|
893 |
|
894 lemma (in information_space) entropy_eq_cartesian_product: |
|
895 assumes "simple_function M X" "simple_function M Y" |
|
896 shows "\<H>(\<lambda>x. (X x, Y x)) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M. |
|
897 joint_distribution X Y {(x,y)} * log b (joint_distribution X Y {(x,y)}))" |
|
898 proof - |
|
899 have sf: "simple_function M (\<lambda>x. (X x, Y x))" |
|
900 using assms by (auto intro: simple_function_Pair) |
|
901 { fix x assume "x\<notin>(\<lambda>x. (X x, Y x))`space M" |
|
902 then have "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = {}" by auto |
|
903 then have "joint_distribution X Y {x} = 0" |
|
904 unfolding distribution_def by auto } |
|
905 then show ?thesis using sf assms |
|
906 unfolding entropy_eq[OF sf] neg_equal_iff_equal setsum_cartesian_product |
|
907 by (auto intro!: setsum_mono_zero_cong_left simp: simple_function_def) |
|
908 qed |
746 qed |
909 |
747 |
910 subsection {* Conditional Mutual Information *} |
748 subsection {* Conditional Mutual Information *} |
911 |
749 |
912 definition (in prob_space) |
750 definition (in prob_space) |
915 mutual_information b MX MZ X Z" |
753 mutual_information b MX MZ X Z" |
916 |
754 |
917 abbreviation (in information_space) |
755 abbreviation (in information_space) |
918 conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where |
756 conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where |
919 "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b |
757 "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b |
920 \<lparr> space = X`space M, sets = Pow (X`space M), measure = ereal\<circ>distribution X \<rparr> |
758 (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z" |
921 \<lparr> space = Y`space M, sets = Pow (Y`space M), measure = ereal\<circ>distribution Y \<rparr> |
|
922 \<lparr> space = Z`space M, sets = Pow (Z`space M), measure = ereal\<circ>distribution Z \<rparr> |
|
923 X Y Z" |
|
924 |
759 |
925 lemma (in information_space) conditional_mutual_information_generic_eq: |
760 lemma (in information_space) conditional_mutual_information_generic_eq: |
926 assumes MX: "finite_random_variable MX X" |
761 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P" |
927 and MY: "finite_random_variable MY Y" |
762 assumes Px: "distributed M S X Px" |
928 and MZ: "finite_random_variable MZ Z" |
763 assumes Pz: "distributed M P Z Pz" |
929 shows "conditional_mutual_information b MX MY MZ X Y Z = (\<Sum>(x, y, z) \<in> space MX \<times> space MY \<times> space MZ. |
764 assumes Pyz: "distributed M (T \<Otimes>\<^isub>M P) (\<lambda>x. (Y x, Z x)) Pyz" |
930 distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)} * |
765 assumes Pxz: "distributed M (S \<Otimes>\<^isub>M P) (\<lambda>x. (X x, Z x)) Pxz" |
931 log b (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)} / |
766 assumes Pxyz: "distributed M (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz" |
932 (joint_distribution X Z {(x, z)} * (joint_distribution Y Z {(y,z)} / distribution Z {z}))))" |
767 assumes I1: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))" |
933 (is "_ = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * (?YZ y z / ?Z z))))") |
768 assumes I2: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))" |
934 proof - |
769 shows "conditional_mutual_information b S T P X Y Z |
935 let ?X = "\<lambda>x. distribution X {x}" |
770 = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))" |
936 note finite_var = MX MY MZ |
771 proof - |
937 note YZ = finite_random_variable_pairI[OF finite_var(2,3)] |
772 interpret S: sigma_finite_measure S by fact |
938 note XYZ = finite_random_variable_pairI[OF MX YZ] |
773 interpret T: sigma_finite_measure T by fact |
939 note XZ = finite_random_variable_pairI[OF finite_var(1,3)] |
774 interpret P: sigma_finite_measure P by fact |
940 note ZX = finite_random_variable_pairI[OF finite_var(3,1)] |
775 interpret TP: pair_sigma_finite T P .. |
941 note YZX = finite_random_variable_pairI[OF finite_var(2) ZX] |
776 interpret SP: pair_sigma_finite S P .. |
942 note order1 = |
777 interpret SPT: pair_sigma_finite "S \<Otimes>\<^isub>M P" T .. |
943 finite_distribution_order(5,6)[OF finite_var(1) YZ] |
778 interpret STP: pair_sigma_finite S "T \<Otimes>\<^isub>M P" .. |
944 finite_distribution_order(5,6)[OF finite_var(1,3)] |
779 have TP: "sigma_finite_measure (T \<Otimes>\<^isub>M P)" .. |
945 |
780 have SP: "sigma_finite_measure (S \<Otimes>\<^isub>M P)" .. |
946 note random_var = finite_var[THEN finite_random_variableD] |
781 have YZ: "random_variable (T \<Otimes>\<^isub>M P) (\<lambda>x. (Y x, Z x))" |
947 note finite = finite_var(1) YZ finite_var(3) XZ YZX |
782 using Pyz by (simp add: distributed_measurable) |
948 |
783 |
949 have order2: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk> |
784 have Pxyz_f: "\<And>M f. f \<in> measurable M (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) \<Longrightarrow> (\<lambda>x. Pxyz (f x)) \<in> borel_measurable M" |
950 \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0" |
785 using measurable_comp[OF _ Pxyz[THEN distributed_real_measurable]] by (auto simp: comp_def) |
951 unfolding joint_distribution_commute_singleton[of X] |
786 |
952 unfolding joint_distribution_assoc_singleton[symmetric] |
787 { fix f g h M |
953 using finite_distribution_order(6)[OF finite_var(2) ZX] |
788 assume f: "f \<in> measurable M S" and g: "g \<in> measurable M P" and h: "h \<in> measurable M (S \<Otimes>\<^isub>M P)" |
954 by auto |
789 from measurable_comp[OF h Pxz[THEN distributed_real_measurable]] |
955 |
790 measurable_comp[OF f Px[THEN distributed_real_measurable]] |
956 have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * (?YZ y z / ?Z z)))) = |
791 measurable_comp[OF g Pz[THEN distributed_real_measurable]] |
957 (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * (log b (?XYZ x y z / (?X x * ?YZ y z)) - log b (?XZ x z / (?X x * ?Z z))))" |
792 have "(\<lambda>x. log b (Pxz (h x) / (Px (f x) * Pz (g x)))) \<in> borel_measurable M" |
958 (is "(\<Sum>(x, y, z)\<in>?S. ?L x y z) = (\<Sum>(x, y, z)\<in>?S. ?R x y z)") |
793 by (simp add: comp_def b_gt_1) } |
959 proof (safe intro!: setsum_cong) |
794 note borel_log = this |
960 fix x y z assume space: "x \<in> space MX" "y \<in> space MY" "z \<in> space MZ" |
795 |
961 show "?L x y z = ?R x y z" |
796 have measurable_cut: "(\<lambda>(x, y, z). (x, z)) \<in> measurable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (S \<Otimes>\<^isub>M P)" |
|
797 by (auto simp add: split_beta' comp_def intro!: measurable_Pair measurable_snd') |
|
798 |
|
799 from Pxz Pxyz have distr_eq: "distr M (S \<Otimes>\<^isub>M P) (\<lambda>x. (X x, Z x)) = |
|
800 distr (distr M (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>x. (X x, Y x, Z x))) (S \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). (x, z))" |
|
801 by (subst distr_distr[OF measurable_cut]) (auto dest: distributed_measurable simp: comp_def) |
|
802 |
|
803 have "mutual_information b S P X Z = |
|
804 (\<integral>x. Pxz x * log b (Pxz x / (Px (fst x) * Pz (snd x))) \<partial>(S \<Otimes>\<^isub>M P))" |
|
805 by (rule mutual_information_distr[OF S P Px Pz Pxz]) |
|
806 also have "\<dots> = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))" |
|
807 using b_gt_1 Pxz Px Pz |
|
808 by (subst distributed_transform_integral[OF Pxyz Pxz, where T="\<lambda>(x, y, z). (x, z)"]) |
|
809 (auto simp: split_beta' intro!: measurable_Pair measurable_snd' measurable_snd'' measurable_fst'' borel_measurable_times |
|
810 dest!: distributed_real_measurable) |
|
811 finally have mi_eq: |
|
812 "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))" . |
|
813 |
|
814 have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0" |
|
815 by (intro subdensity_real[of fst, OF _ Pxyz Px]) auto |
|
816 moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0" |
|
817 by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) (auto intro: measurable_snd') |
|
818 moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0" |
|
819 by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) (auto intro: measurable_Pair measurable_snd') |
|
820 moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0" |
|
821 by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) (auto intro: measurable_Pair) |
|
822 moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Px (fst x)" |
|
823 using Px by (intro STP.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'' dest: distributed_real_AE distributed_real_measurable) |
|
824 moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pyz (snd x)" |
|
825 using Pyz by (intro STP.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable) |
|
826 moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pz (snd (snd x))" |
|
827 using Pz Pz[THEN distributed_real_measurable] by (auto intro!: measurable_snd'' TP.AE_pair_measure STP.AE_pair_measure AE_I2[of S] dest: distributed_real_AE) |
|
828 moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pxz (fst x, snd (snd x))" |
|
829 using Pxz[THEN distributed_real_AE, THEN SP.AE_pair] |
|
830 using measurable_comp[OF measurable_Pair[OF measurable_fst measurable_comp[OF measurable_snd measurable_snd]] Pxz[THEN distributed_real_measurable], of T] |
|
831 using measurable_comp[OF measurable_snd measurable_Pair2[OF Pxz[THEN distributed_real_measurable]], of _ T] |
|
832 by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure simp: comp_def) |
|
833 moreover note Pxyz[THEN distributed_real_AE] |
|
834 ultimately have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. |
|
835 Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) - |
|
836 Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) = |
|
837 Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) " |
|
838 proof eventually_elim |
|
839 case (goal1 x) |
|
840 show ?case |
962 proof cases |
841 proof cases |
963 assume "?XYZ x y z \<noteq> 0" |
842 assume "Pxyz x \<noteq> 0" |
964 with space have "0 < ?X x" "0 < ?Z z" "0 < ?XZ x z" "0 < ?YZ y z" "0 < ?XYZ x y z" |
843 with goal1 have "0 < Px (fst x)" "0 < Pz (snd (snd x))" "0 < Pxz (fst x, snd (snd x))" "0 < Pyz (snd x)" "0 < Pxyz x" |
965 using order1 order2 by (auto simp: less_le) |
844 by auto |
966 with b_gt_1 show ?thesis |
845 then show ?thesis |
967 by (simp add: log_mult log_divide zero_less_mult_iff zero_less_divide_iff) |
846 using b_gt_1 by (simp add: log_simps mult_pos_pos less_imp_le field_simps) |
968 qed simp |
847 qed simp |
969 qed |
848 qed |
970 also have "\<dots> = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) - |
849 with I1 I2 show ?thesis |
971 (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z)))" |
|
972 by (auto simp add: setsum_subtractf[symmetric] field_simps intro!: setsum_cong) |
|
973 also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z))) = |
|
974 (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z)))" |
|
975 unfolding setsum_cartesian_product[symmetric] setsum_commute[of _ _ "space MY"] |
|
976 setsum_left_distrib[symmetric] |
|
977 unfolding joint_distribution_commute_singleton[of X] |
|
978 unfolding joint_distribution_assoc_singleton[symmetric] |
|
979 using setsum_joint_distribution_singleton[OF finite_var(2) ZX] |
|
980 by (intro setsum_cong refl) (simp add: space_pair_measure) |
|
981 also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) - |
|
982 (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z))) = |
|
983 conditional_mutual_information b MX MY MZ X Y Z" |
|
984 unfolding conditional_mutual_information_def |
850 unfolding conditional_mutual_information_def |
985 unfolding mutual_information_generic_eq[OF finite_var(1,3)] |
851 apply (subst mi_eq) |
986 unfolding mutual_information_generic_eq[OF finite_var(1) YZ] |
852 apply (subst mutual_information_distr[OF S TP Px Pyz Pxyz]) |
987 by (simp add: space_sigma space_pair_measure setsum_cartesian_product') |
853 apply (subst integral_diff(2)[symmetric]) |
988 finally show ?thesis by simp |
854 apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff) |
|
855 done |
989 qed |
856 qed |
990 |
857 |
991 lemma (in information_space) conditional_mutual_information_eq: |
858 lemma (in information_space) conditional_mutual_information_eq: |
992 assumes "simple_function M X" "simple_function M Y" "simple_function M Z" |
859 assumes Pz: "simple_distributed M Z Pz" |
993 shows "\<I>(X;Y|Z) = (\<Sum>(x, y, z) \<in> X`space M \<times> Y`space M \<times> Z`space M. |
860 assumes Pyz: "simple_distributed M (\<lambda>x. (Y x, Z x)) Pyz" |
994 distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)} * |
861 assumes Pxz: "simple_distributed M (\<lambda>x. (X x, Z x)) Pxz" |
995 log b (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)} / |
862 assumes Pxyz: "simple_distributed M (\<lambda>x. (X x, Y x, Z x)) Pxyz" |
996 (joint_distribution X Z {(x, z)} * joint_distribution Y Z {(y,z)} / distribution Z {z})))" |
863 shows "\<I>(X ; Y | Z) = |
997 by (subst conditional_mutual_information_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]) |
864 (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x))`space M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))" |
998 simp |
865 proof (subst conditional_mutual_information_generic_eq[OF _ _ _ _ |
999 |
866 simple_distributed[OF Pz] simple_distributed_joint[OF Pyz] simple_distributed_joint[OF Pxz] |
1000 lemma (in information_space) conditional_mutual_information_eq_mutual_information: |
867 simple_distributed_joint2[OF Pxyz]]) |
1001 assumes X: "simple_function M X" and Y: "simple_function M Y" |
868 note simple_distributed_joint2_finite[OF Pxyz, simp] |
1002 shows "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))" |
869 show "sigma_finite_measure (count_space (X ` space M))" |
1003 proof - |
870 by (simp add: sigma_finite_measure_count_space_finite) |
1004 have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto |
871 show "sigma_finite_measure (count_space (Y ` space M))" |
1005 have C: "simple_function M (\<lambda>x. ())" by auto |
872 by (simp add: sigma_finite_measure_count_space_finite) |
1006 show ?thesis |
873 show "sigma_finite_measure (count_space (Z ` space M))" |
1007 unfolding conditional_mutual_information_eq[OF X Y C] |
874 by (simp add: sigma_finite_measure_count_space_finite) |
1008 unfolding mutual_information_eq[OF X Y] |
875 have "count_space (X ` space M) \<Otimes>\<^isub>M count_space (Y ` space M) \<Otimes>\<^isub>M count_space (Z ` space M) = |
1009 by (simp add: setsum_cartesian_product' distribution_remove_const) |
876 count_space (X`space M \<times> Y`space M \<times> Z`space M)" |
1010 qed |
877 (is "?P = ?C") |
1011 |
878 by (simp add: pair_measure_count_space) |
1012 lemma (in information_space) conditional_mutual_information_generic_positive: |
879 |
1013 assumes X: "finite_random_variable MX X" and Y: "finite_random_variable MY Y" and Z: "finite_random_variable MZ Z" |
880 let ?Px = "\<lambda>x. measure M (X -` {x} \<inter> space M)" |
1014 shows "0 \<le> conditional_mutual_information b MX MY MZ X Y Z" |
881 have "(\<lambda>x. (X x, Z x)) \<in> measurable M (count_space (X ` space M) \<Otimes>\<^isub>M count_space (Z ` space M))" |
1015 proof (cases "space MX \<times> space MY \<times> space MZ = {}") |
882 using simple_distributed_joint[OF Pxz] by (rule distributed_measurable) |
1016 case True show ?thesis |
883 from measurable_comp[OF this measurable_fst] |
1017 unfolding conditional_mutual_information_generic_eq[OF assms] True |
884 have "random_variable (count_space (X ` space M)) X" |
1018 by simp |
885 by (simp add: comp_def) |
1019 next |
886 then have "simple_function M X" |
1020 case False |
887 unfolding simple_function_def by auto |
1021 let ?dXYZ = "distribution (\<lambda>x. (X x, Y x, Z x))" |
888 then have "simple_distributed M X ?Px" |
1022 let ?dXZ = "joint_distribution X Z" |
889 by (rule simple_distributedI) auto |
1023 let ?dYZ = "joint_distribution Y Z" |
890 then show "distributed M (count_space (X ` space M)) X ?Px" |
1024 let ?dX = "distribution X" |
891 by (rule simple_distributed) |
1025 let ?dZ = "distribution Z" |
892 |
1026 let ?M = "space MX \<times> space MY \<times> space MZ" |
893 let ?f = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then Pxyz x else 0)" |
1027 |
894 let ?g = "(\<lambda>x. if x \<in> (\<lambda>x. (Y x, Z x)) ` space M then Pyz x else 0)" |
1028 note YZ = finite_random_variable_pairI[OF Y Z] |
895 let ?h = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Z x)) ` space M then Pxz x else 0)" |
1029 note XZ = finite_random_variable_pairI[OF X Z] |
896 show |
1030 note ZX = finite_random_variable_pairI[OF Z X] |
897 "integrable ?P (\<lambda>(x, y, z). ?f (x, y, z) * log b (?f (x, y, z) / (?Px x * ?g (y, z))))" |
1031 note YZ = finite_random_variable_pairI[OF Y Z] |
898 "integrable ?P (\<lambda>(x, y, z). ?f (x, y, z) * log b (?h (x, z) / (?Px x * Pz z)))" |
1032 note XYZ = finite_random_variable_pairI[OF X YZ] |
899 by (auto intro!: integrable_count_space simp: pair_measure_count_space) |
1033 note finite = Z YZ XZ XYZ |
900 let ?i = "\<lambda>x y z. ?f (x, y, z) * log b (?f (x, y, z) / (?h (x, z) * (?g (y, z) / Pz z)))" |
1034 have order: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk> |
901 let ?j = "\<lambda>x y z. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z)))" |
1035 \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0" |
902 have "(\<lambda>(x, y, z). ?i x y z) = (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x)) ` space M then ?j (fst x) (fst (snd x)) (snd (snd x)) else 0)" |
1036 unfolding joint_distribution_commute_singleton[of X] |
903 by (auto intro!: ext) |
1037 unfolding joint_distribution_assoc_singleton[symmetric] |
904 then show "(\<integral> (x, y, z). ?i x y z \<partial>?P) = (\<Sum>(x, y, z)\<in>(\<lambda>x. (X x, Y x, Z x)) ` space M. ?j x y z)" |
1038 using finite_distribution_order(6)[OF Y ZX] |
905 by (auto intro!: setsum_cong simp add: `?P = ?C` lebesgue_integral_count_space_finite simple_distributed_finite setsum_cases split_beta') |
1039 by auto |
906 qed |
1040 |
907 |
1041 note order = order |
908 lemma (in information_space) conditional_mutual_information_nonneg: |
1042 finite_distribution_order(5,6)[OF X YZ] |
909 assumes X: "simple_function M X" and Y: "simple_function M Y" and Z: "simple_function M Z" |
1043 finite_distribution_order(5,6)[OF Y Z] |
910 shows "0 \<le> \<I>(X ; Y | Z)" |
1044 |
911 proof - |
1045 have "- conditional_mutual_information b MX MY MZ X Y Z = - (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} * |
912 def Pz \<equiv> "\<lambda>x. if x \<in> Z`space M then measure M (Z -` {x} \<inter> space M) else 0" |
1046 log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))" |
913 def Pxz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Z x))`space M then measure M ((\<lambda>x. (X x, Z x)) -` {x} \<inter> space M) else 0" |
1047 unfolding conditional_mutual_information_generic_eq[OF assms] neg_equal_iff_equal by auto |
914 def Pyz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (Y x, Z x))`space M then measure M ((\<lambda>x. (Y x, Z x)) -` {x} \<inter> space M) else 0" |
1048 also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})" |
915 def Pxyz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x))`space M then measure M ((\<lambda>x. (X x, Y x, Z x)) -` {x} \<inter> space M) else 0" |
|
916 let ?M = "X`space M \<times> Y`space M \<times> Z`space M" |
|
917 |
|
918 note XZ = simple_function_Pair[OF X Z] |
|
919 note YZ = simple_function_Pair[OF Y Z] |
|
920 note XYZ = simple_function_Pair[OF X simple_function_Pair[OF Y Z]] |
|
921 have Pz: "simple_distributed M Z Pz" |
|
922 using Z by (rule simple_distributedI) (auto simp: Pz_def) |
|
923 have Pxz: "simple_distributed M (\<lambda>x. (X x, Z x)) Pxz" |
|
924 using XZ by (rule simple_distributedI) (auto simp: Pxz_def) |
|
925 have Pyz: "simple_distributed M (\<lambda>x. (Y x, Z x)) Pyz" |
|
926 using YZ by (rule simple_distributedI) (auto simp: Pyz_def) |
|
927 have Pxyz: "simple_distributed M (\<lambda>x. (X x, Y x, Z x)) Pxyz" |
|
928 using XYZ by (rule simple_distributedI) (auto simp: Pxyz_def) |
|
929 |
|
930 { fix z assume z: "z \<in> Z ` space M" then have "(\<Sum>x\<in>X ` space M. Pxz (x, z)) = Pz z" |
|
931 using distributed_marginal_eq_joint_simple[OF X Pz Pxz z] |
|
932 by (auto intro!: setsum_cong simp: Pxz_def) } |
|
933 note marginal1 = this |
|
934 |
|
935 { fix z assume z: "z \<in> Z ` space M" then have "(\<Sum>y\<in>Y ` space M. Pyz (y, z)) = Pz z" |
|
936 using distributed_marginal_eq_joint_simple[OF Y Pz Pyz z] |
|
937 by (auto intro!: setsum_cong simp: Pyz_def) } |
|
938 note marginal2 = this |
|
939 |
|
940 have "- \<I>(X ; Y | Z) = - (\<Sum>(x, y, z) \<in> ?M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))" |
|
941 unfolding conditional_mutual_information_eq[OF Pz Pyz Pxz Pxyz] |
|
942 using X Y Z by (auto intro!: setsum_mono_zero_left simp: Pxyz_def simple_functionD) |
|
943 also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. Pxz (x, z) * (Pyz (y,z) / Pz z))" |
1049 unfolding split_beta' |
944 unfolding split_beta' |
1050 proof (rule log_setsum_divide) |
945 proof (rule log_setsum_divide) |
1051 show "?M \<noteq> {}" using False by simp |
946 show "?M \<noteq> {}" using not_empty by simp |
1052 show "1 < b" using b_gt_1 . |
947 show "1 < b" using b_gt_1 . |
1053 |
948 |
1054 show "finite ?M" using assms |
949 show "finite ?M" using X Y Z by (auto simp: simple_functionD) |
1055 unfolding finite_sigma_algebra_def finite_sigma_algebra_axioms_def by auto |
950 |
1056 |
951 then show "(\<Sum>x\<in>?M. Pxyz (fst x, fst (snd x), snd (snd x))) = 1" |
1057 show "(\<Sum>x\<in>?M. ?dXYZ {(fst x, fst (snd x), snd (snd x))}) = 1" |
952 apply (subst Pxyz[THEN simple_distributed_setsum_space, symmetric]) |
1058 unfolding setsum_cartesian_product' |
953 apply simp |
1059 unfolding setsum_commute[of _ "space MY"] |
954 apply (intro setsum_mono_zero_right) |
1060 unfolding setsum_commute[of _ "space MZ"] |
955 apply (auto simp: Pxyz_def) |
1061 by (simp_all add: space_pair_measure |
956 done |
1062 setsum_joint_distribution_singleton[OF X YZ] |
957 let ?N = "(\<lambda>x. (X x, Y x, Z x)) ` space M" |
1063 setsum_joint_distribution_singleton[OF Y Z] |
958 fix x assume x: "x \<in> ?M" |
1064 setsum_distribution[OF Z]) |
959 let ?Q = "Pxyz (fst x, fst (snd x), snd (snd x))" |
1065 |
960 let ?P = "Pxz (fst x, snd (snd x)) * (Pyz (fst (snd x), snd (snd x)) / Pz (snd (snd x)))" |
1066 fix x assume "x \<in> ?M" |
961 from x show "0 \<le> ?Q" "0 \<le> ?P" |
1067 let ?x = "(fst x, fst (snd x), snd (snd x))" |
962 using Pxyz[THEN simple_distributed, THEN distributed_real_AE] |
1068 |
963 using Pxz[THEN simple_distributed, THEN distributed_real_AE] |
1069 show "0 \<le> ?dXYZ {?x}" |
964 using Pyz[THEN simple_distributed, THEN distributed_real_AE] |
1070 "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}" |
965 using Pz[THEN simple_distributed, THEN distributed_real_AE] |
1071 by (simp_all add: mult_nonneg_nonneg divide_nonneg_nonneg) |
966 by (auto intro!: mult_nonneg_nonneg divide_nonneg_nonneg simp: AE_count_space Pxyz_def Pxz_def Pyz_def Pz_def) |
1072 |
967 moreover assume "0 < ?Q" |
1073 assume *: "0 < ?dXYZ {?x}" |
968 moreover have "AE x in count_space ?N. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0" |
1074 with `x \<in> ?M` finite order show "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}" |
969 by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz[THEN simple_distributed] Pz[THEN simple_distributed]]) (auto intro: measurable_snd') |
1075 by (cases x) (auto simp add: zero_le_mult_iff zero_le_divide_iff less_le) |
970 then have "\<And>x. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0" |
|
971 by (auto simp: Pz_def Pxyz_def AE_count_space) |
|
972 moreover have "AE x in count_space ?N. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0" |
|
973 by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz[THEN simple_distributed] Pxz[THEN simple_distributed]]) (auto intro: measurable_Pair measurable_snd') |
|
974 then have "\<And>x. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0" |
|
975 by (auto simp: Pz_def Pxyz_def AE_count_space) |
|
976 moreover have "AE x in count_space ?N. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0" |
|
977 by (intro subdensity_real[of snd, OF _ Pxyz[THEN simple_distributed] Pyz[THEN simple_distributed]]) (auto intro: measurable_Pair) |
|
978 then have "\<And>x. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0" |
|
979 by (auto simp: Pz_def Pxyz_def AE_count_space) |
|
980 ultimately show "0 < ?P" using x by (auto intro!: divide_pos_pos mult_pos_pos simp: less_le) |
1076 qed |
981 qed |
1077 also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>space MZ. ?dZ {z})" |
982 also have "(\<Sum>(x, y, z) \<in> ?M. Pxz (x, z) * (Pyz (y,z) / Pz z)) = (\<Sum>z\<in>Z`space M. Pz z)" |
1078 apply (simp add: setsum_cartesian_product') |
983 apply (simp add: setsum_cartesian_product') |
1079 apply (subst setsum_commute) |
984 apply (subst setsum_commute) |
1080 apply (subst (2) setsum_commute) |
985 apply (subst (2) setsum_commute) |
1081 by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric] |
986 apply (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric] marginal1 marginal2 |
1082 setsum_joint_distribution_singleton[OF X Z] |
|
1083 setsum_joint_distribution_singleton[OF Y Z] |
|
1084 intro!: setsum_cong) |
987 intro!: setsum_cong) |
1085 also have "log b (\<Sum>z\<in>space MZ. ?dZ {z}) = 0" |
988 done |
1086 unfolding setsum_distribution[OF Z] by simp |
989 also have "log b (\<Sum>z\<in>Z`space M. Pz z) = 0" |
|
990 using Pz[THEN simple_distributed_setsum_space] by simp |
1087 finally show ?thesis by simp |
991 finally show ?thesis by simp |
1088 qed |
992 qed |
1089 |
993 |
1090 lemma (in information_space) conditional_mutual_information_positive: |
|
1091 assumes "simple_function M X" and "simple_function M Y" and "simple_function M Z" |
|
1092 shows "0 \<le> \<I>(X;Y|Z)" |
|
1093 by (rule conditional_mutual_information_generic_positive[OF assms[THEN simple_function_imp_finite_random_variable]]) |
|
1094 |
|
1095 subsection {* Conditional Entropy *} |
994 subsection {* Conditional Entropy *} |
1096 |
995 |
1097 definition (in prob_space) |
996 definition (in prob_space) |
1098 "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y" |
997 "conditional_entropy b S T X Y = entropy b (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x)) - entropy b T Y" |
1099 |
998 |
1100 abbreviation (in information_space) |
999 abbreviation (in information_space) |
1101 conditional_entropy_Pow ("\<H>'(_ | _')") where |
1000 conditional_entropy_Pow ("\<H>'(_ | _')") where |
1102 "\<H>(X | Y) \<equiv> conditional_entropy b |
1001 "\<H>(X | Y) \<equiv> conditional_entropy b (count_space (X`space M)) (count_space (Y`space M)) X Y" |
1103 \<lparr> space = X`space M, sets = Pow (X`space M), measure = ereal\<circ>distribution X \<rparr> |
|
1104 \<lparr> space = Y`space M, sets = Pow (Y`space M), measure = ereal\<circ>distribution Y \<rparr> X Y" |
|
1105 |
|
1106 lemma (in information_space) conditional_entropy_positive: |
|
1107 "simple_function M X \<Longrightarrow> simple_function M Y \<Longrightarrow> 0 \<le> \<H>(X | Y)" |
|
1108 unfolding conditional_entropy_def by (auto intro!: conditional_mutual_information_positive) |
|
1109 |
1002 |
1110 lemma (in information_space) conditional_entropy_generic_eq: |
1003 lemma (in information_space) conditional_entropy_generic_eq: |
1111 fixes MX :: "('c, 'd) measure_space_scheme" and MY :: "('e, 'f) measure_space_scheme" |
1004 fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" |
1112 assumes MX: "finite_random_variable MX X" |
1005 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" |
1113 assumes MZ: "finite_random_variable MZ Z" |
1006 assumes Px: "distributed M S X Px" |
1114 shows "conditional_entropy b MX MZ X Z = |
1007 assumes Py: "distributed M T Y Py" |
1115 - (\<Sum>(x, z)\<in>space MX \<times> space MZ. |
1008 assumes Pxy: "distributed M (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x)) Pxy" |
1116 joint_distribution X Z {(x, z)} * log b (joint_distribution X Z {(x, z)} / distribution Z {z}))" |
1009 assumes I1: "integrable (S \<Otimes>\<^isub>M T) (\<lambda>x. Pxy x * log b (Pxy x))" |
1117 proof - |
1010 assumes I2: "integrable (S \<Otimes>\<^isub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))" |
1118 interpret MX: finite_sigma_algebra MX using MX by simp |
1011 shows "conditional_entropy b S T X Y = - (\<integral>(x, y). Pxy (x, y) * log b (Pxy (x, y) / Py y) \<partial>(S \<Otimes>\<^isub>M T))" |
1119 interpret MZ: finite_sigma_algebra MZ using MZ by simp |
1012 proof - |
1120 let ?XXZ = "\<lambda>x y z. joint_distribution X (\<lambda>x. (X x, Z x)) {(x, y, z)}" |
1013 interpret S: sigma_finite_measure S by fact |
1121 let ?XZ = "\<lambda>x z. joint_distribution X Z {(x, z)}" |
1014 interpret T: sigma_finite_measure T by fact |
1122 let ?Z = "\<lambda>z. distribution Z {z}" |
1015 interpret ST: pair_sigma_finite S T .. |
1123 let ?f = "\<lambda>x y z. log b (?XXZ x y z * ?Z z / (?XZ x z * ?XZ y z))" |
1016 have ST: "sigma_finite_measure (S \<Otimes>\<^isub>M T)" .. |
1124 { fix x z have "?XXZ x x z = ?XZ x z" |
1017 |
1125 unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>']) } |
1018 interpret Pxy: prob_space "density (S \<Otimes>\<^isub>M T) Pxy" |
1126 note this[simp] |
1019 unfolding Pxy[THEN distributed_distr_eq_density, symmetric] |
1127 { fix x x' :: 'c and z assume "x' \<noteq> x" |
1020 using Pxy[THEN distributed_measurable] by (rule prob_space_distr) |
1128 then have "?XXZ x x' z = 0" |
1021 |
1129 by (auto simp: distribution_def empty_measure'[symmetric] |
1022 from Py Pxy have distr_eq: "distr M T Y = |
1130 simp del: empty_measure' intro!: arg_cong[where f=\<mu>']) } |
1023 distr (distr M (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x))) T snd" |
1131 note this[simp] |
1024 by (subst distr_distr[OF measurable_snd]) (auto dest: distributed_measurable simp: comp_def) |
1132 { fix x x' z assume *: "x \<in> space MX" "z \<in> space MZ" |
1025 |
1133 then have "(\<Sum>x'\<in>space MX. ?XXZ x x' z * ?f x x' z) |
1026 have "entropy b T Y = - (\<integral>y. Py y * log b (Py y) \<partial>T)" |
1134 = (\<Sum>x'\<in>space MX. if x = x' then ?XZ x z * ?f x x z else 0)" |
1027 by (rule entropy_distr[OF T Py]) |
1135 by (auto intro!: setsum_cong) |
1028 also have "\<dots> = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^isub>M T))" |
1136 also have "\<dots> = ?XZ x z * ?f x x z" |
1029 using b_gt_1 Py[THEN distributed_real_measurable] |
1137 using `x \<in> space MX` by (simp add: setsum_cases[OF MX.finite_space]) |
1030 by (subst distributed_transform_integral[OF Pxy Py, where T=snd]) (auto intro!: integral_cong) |
1138 also have "\<dots> = ?XZ x z * log b (?Z z / ?XZ x z)" by auto |
1031 finally have e_eq: "entropy b T Y = - (\<integral>(x,y). Pxy (x,y) * log b (Py y) \<partial>(S \<Otimes>\<^isub>M T))" . |
1139 also have "\<dots> = - ?XZ x z * log b (?XZ x z / ?Z z)" |
1032 |
1140 using finite_distribution_order(6)[OF MX MZ] |
1033 have "AE x in S \<Otimes>\<^isub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0" |
1141 by (auto simp: log_simps field_simps zero_less_mult_iff) |
1034 by (intro subdensity_real[of fst, OF _ Pxy Px]) (auto intro: measurable_Pair) |
1142 finally have "(\<Sum>x'\<in>space MX. ?XXZ x x' z * ?f x x' z) = - ?XZ x z * log b (?XZ x z / ?Z z)" . } |
1035 moreover have "AE x in S \<Otimes>\<^isub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0" |
1143 note * = this |
1036 by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair) |
|
1037 moreover have "AE x in S \<Otimes>\<^isub>M T. 0 \<le> Px (fst x)" |
|
1038 using Px by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'' dest: distributed_real_AE distributed_real_measurable) |
|
1039 moreover have "AE x in S \<Otimes>\<^isub>M T. 0 \<le> Py (snd x)" |
|
1040 using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable) |
|
1041 moreover note Pxy[THEN distributed_real_AE] |
|
1042 ultimately have pos: "AE x in S \<Otimes>\<^isub>M T. 0 \<le> Pxy x \<and> 0 \<le> Px (fst x) \<and> 0 \<le> Py (snd x) \<and> |
|
1043 (Pxy x = 0 \<or> (Pxy x \<noteq> 0 \<longrightarrow> 0 < Pxy x \<and> 0 < Px (fst x) \<and> 0 < Py (snd x)))" |
|
1044 by eventually_elim auto |
|
1045 |
|
1046 from pos have "AE x in S \<Otimes>\<^isub>M T. |
|
1047 Pxy x * log b (Pxy x) - Pxy x * log b (Py (snd x)) = Pxy x * log b (Pxy x / Py (snd x))" |
|
1048 by eventually_elim (auto simp: log_simps mult_pos_pos field_simps b_gt_1) |
|
1049 with I1 I2 show ?thesis |
|
1050 unfolding conditional_entropy_def |
|
1051 apply (subst e_eq) |
|
1052 apply (subst entropy_distr[OF ST Pxy]) |
|
1053 unfolding minus_diff_minus |
|
1054 apply (subst integral_diff(2)[symmetric]) |
|
1055 apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff) |
|
1056 done |
|
1057 qed |
|
1058 |
|
1059 lemma (in information_space) conditional_entropy_eq: |
|
1060 assumes Y: "simple_distributed M Y Py" and X: "simple_function M X" |
|
1061 assumes XY: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy" |
|
1062 shows "\<H>(X | Y) = - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))" |
|
1063 proof (subst conditional_entropy_generic_eq[OF _ _ |
|
1064 simple_distributed[OF simple_distributedI[OF X refl]] simple_distributed[OF Y] simple_distributed_joint[OF XY]]) |
|
1065 have [simp]: "finite (X`space M)" using X by (simp add: simple_function_def) |
|
1066 note Y[THEN simple_distributed_finite, simp] |
|
1067 show "sigma_finite_measure (count_space (X ` space M))" |
|
1068 by (simp add: sigma_finite_measure_count_space_finite) |
|
1069 show "sigma_finite_measure (count_space (Y ` space M))" |
|
1070 by (simp add: sigma_finite_measure_count_space_finite) |
|
1071 let ?f = "(\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x else 0)" |
|
1072 have "count_space (X ` space M) \<Otimes>\<^isub>M count_space (Y ` space M) = count_space (X`space M \<times> Y`space M)" |
|
1073 (is "?P = ?C") |
|
1074 using X Y by (simp add: simple_distributed_finite pair_measure_count_space) |
|
1075 with X Y show |
|
1076 "integrable ?P (\<lambda>x. ?f x * log b (?f x))" |
|
1077 "integrable ?P (\<lambda>x. ?f x * log b (Py (snd x)))" |
|
1078 by (auto intro!: integrable_count_space simp: simple_distributed_finite) |
|
1079 have eq: "(\<lambda>(x, y). ?f (x, y) * log b (?f (x, y) / Py y)) = |
|
1080 (\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x)) ` space M then Pxy x * log b (Pxy x / Py (snd x)) else 0)" |
|
1081 by auto |
|
1082 from X Y show "- (\<integral> (x, y). ?f (x, y) * log b (?f (x, y) / Py y) \<partial>?P) = |
|
1083 - (\<Sum>(x, y)\<in>(\<lambda>x. (X x, Y x)) ` space M. Pxy (x, y) * log b (Pxy (x, y) / Py y))" |
|
1084 by (auto intro!: setsum_cong simp add: `?P = ?C` lebesgue_integral_count_space_finite simple_distributed_finite eq setsum_cases split_beta') |
|
1085 qed |
|
1086 |
|
1087 lemma (in information_space) conditional_mutual_information_eq_conditional_entropy: |
|
1088 assumes X: "simple_function M X" and Y: "simple_function M Y" |
|
1089 shows "\<I>(X ; X | Y) = \<H>(X | Y)" |
|
1090 proof - |
|
1091 def Py \<equiv> "\<lambda>x. if x \<in> Y`space M then measure M (Y -` {x} \<inter> space M) else 0" |
|
1092 def Pxy \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x))`space M then measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M) else 0" |
|
1093 def Pxxy \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, X x, Y x))`space M then measure M ((\<lambda>x. (X x, X x, Y x)) -` {x} \<inter> space M) else 0" |
|
1094 let ?M = "X`space M \<times> X`space M \<times> Y`space M" |
|
1095 |
|
1096 note XY = simple_function_Pair[OF X Y] |
|
1097 note XXY = simple_function_Pair[OF X XY] |
|
1098 have Py: "simple_distributed M Y Py" |
|
1099 using Y by (rule simple_distributedI) (auto simp: Py_def) |
|
1100 have Pxy: "simple_distributed M (\<lambda>x. (X x, Y x)) Pxy" |
|
1101 using XY by (rule simple_distributedI) (auto simp: Pxy_def) |
|
1102 have Pxxy: "simple_distributed M (\<lambda>x. (X x, X x, Y x)) Pxxy" |
|
1103 using XXY by (rule simple_distributedI) (auto simp: Pxxy_def) |
|
1104 have eq: "(\<lambda>x. (X x, X x, Y x)) ` space M = (\<lambda>(x, y). (x, x, y)) ` (\<lambda>x. (X x, Y x)) ` space M" |
|
1105 by auto |
|
1106 have inj: "\<And>A. inj_on (\<lambda>(x, y). (x, x, y)) A" |
|
1107 by (auto simp: inj_on_def) |
|
1108 have Pxxy_eq: "\<And>x y. Pxxy (x, x, y) = Pxy (x, y)" |
|
1109 by (auto simp: Pxxy_def Pxy_def intro!: arg_cong[where f=prob]) |
|
1110 have "AE x in count_space ((\<lambda>x. (X x, Y x))`space M). Py (snd x) = 0 \<longrightarrow> Pxy x = 0" |
|
1111 by (intro subdensity_real[of snd, OF _ Pxy[THEN simple_distributed] Py[THEN simple_distributed]]) (auto intro: measurable_Pair) |
|
1112 then show ?thesis |
|
1113 apply (subst conditional_mutual_information_eq[OF Py Pxy Pxy Pxxy]) |
|
1114 apply (subst conditional_entropy_eq[OF Py X Pxy]) |
|
1115 apply (auto intro!: setsum_cong simp: Pxxy_eq setsum_negf[symmetric] eq setsum_reindex[OF inj] |
|
1116 log_simps zero_less_mult_iff zero_le_mult_iff field_simps mult_less_0_iff AE_count_space) |
|
1117 using Py[THEN simple_distributed, THEN distributed_real_AE] Pxy[THEN simple_distributed, THEN distributed_real_AE] |
|
1118 apply (auto simp add: not_le[symmetric] AE_count_space) |
|
1119 done |
|
1120 qed |
|
1121 |
|
1122 lemma (in information_space) conditional_entropy_nonneg: |
|
1123 assumes X: "simple_function M X" and Y: "simple_function M Y" shows "0 \<le> \<H>(X | Y)" |
|
1124 using conditional_mutual_information_eq_conditional_entropy[OF X Y] conditional_mutual_information_nonneg[OF X X Y] |
|
1125 by simp |
|
1126 |
|
1127 subsection {* Equalities *} |
|
1128 |
|
1129 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy_distr: |
|
1130 fixes Px :: "'b \<Rightarrow> real" and Py :: "'c \<Rightarrow> real" and Pxy :: "('b \<times> 'c) \<Rightarrow> real" |
|
1131 assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" |
|
1132 assumes Px: "distributed M S X Px" and Py: "distributed M T Y Py" |
|
1133 assumes Pxy: "distributed M (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x)) Pxy" |
|
1134 assumes Ix: "integrable(S \<Otimes>\<^isub>M T) (\<lambda>x. Pxy x * log b (Px (fst x)))" |
|
1135 assumes Iy: "integrable(S \<Otimes>\<^isub>M T) (\<lambda>x. Pxy x * log b (Py (snd x)))" |
|
1136 assumes Ixy: "integrable(S \<Otimes>\<^isub>M T) (\<lambda>x. Pxy x * log b (Pxy x))" |
|
1137 shows "mutual_information b S T X Y = entropy b S X + entropy b T Y - entropy b (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x))" |
|
1138 proof - |
|
1139 have X: "entropy b S X = - (\<integral>x. Pxy x * log b (Px (fst x)) \<partial>(S \<Otimes>\<^isub>M T))" |
|
1140 using b_gt_1 Px[THEN distributed_real_measurable] |
|
1141 apply (subst entropy_distr[OF S Px]) |
|
1142 apply (subst distributed_transform_integral[OF Pxy Px, where T=fst]) |
|
1143 apply (auto intro!: integral_cong) |
|
1144 done |
|
1145 |
|
1146 have Y: "entropy b T Y = - (\<integral>x. Pxy x * log b (Py (snd x)) \<partial>(S \<Otimes>\<^isub>M T))" |
|
1147 using b_gt_1 Py[THEN distributed_real_measurable] |
|
1148 apply (subst entropy_distr[OF T Py]) |
|
1149 apply (subst distributed_transform_integral[OF Pxy Py, where T=snd]) |
|
1150 apply (auto intro!: integral_cong) |
|
1151 done |
|
1152 |
|
1153 interpret S: sigma_finite_measure S by fact |
|
1154 interpret T: sigma_finite_measure T by fact |
|
1155 interpret ST: pair_sigma_finite S T .. |
|
1156 have ST: "sigma_finite_measure (S \<Otimes>\<^isub>M T)" .. |
|
1157 |
|
1158 have XY: "entropy b (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x)) = - (\<integral>x. Pxy x * log b (Pxy x) \<partial>(S \<Otimes>\<^isub>M T))" |
|
1159 by (subst entropy_distr[OF ST Pxy]) (auto intro!: integral_cong) |
|
1160 |
|
1161 have "AE x in S \<Otimes>\<^isub>M T. Px (fst x) = 0 \<longrightarrow> Pxy x = 0" |
|
1162 by (intro subdensity_real[of fst, OF _ Pxy Px]) (auto intro: measurable_Pair) |
|
1163 moreover have "AE x in S \<Otimes>\<^isub>M T. Py (snd x) = 0 \<longrightarrow> Pxy x = 0" |
|
1164 by (intro subdensity_real[of snd, OF _ Pxy Py]) (auto intro: measurable_Pair) |
|
1165 moreover have "AE x in S \<Otimes>\<^isub>M T. 0 \<le> Px (fst x)" |
|
1166 using Px by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'' dest: distributed_real_AE distributed_real_measurable) |
|
1167 moreover have "AE x in S \<Otimes>\<^isub>M T. 0 \<le> Py (snd x)" |
|
1168 using Py by (intro ST.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable) |
|
1169 moreover note Pxy[THEN distributed_real_AE] |
|
1170 ultimately have "AE x in S \<Otimes>\<^isub>M T. Pxy x * log b (Pxy x) - Pxy x * log b (Px (fst x)) - Pxy x * log b (Py (snd x)) = |
|
1171 Pxy x * log b (Pxy x / (Px (fst x) * Py (snd x)))" |
|
1172 (is "AE x in _. ?f x = ?g x") |
|
1173 proof eventually_elim |
|
1174 case (goal1 x) |
|
1175 show ?case |
|
1176 proof cases |
|
1177 assume "Pxy x \<noteq> 0" |
|
1178 with goal1 have "0 < Px (fst x)" "0 < Py (snd x)" "0 < Pxy x" |
|
1179 by auto |
|
1180 then show ?thesis |
|
1181 using b_gt_1 by (simp add: log_simps mult_pos_pos less_imp_le field_simps) |
|
1182 qed simp |
|
1183 qed |
|
1184 |
|
1185 have "entropy b S X + entropy b T Y - entropy b (S \<Otimes>\<^isub>M T) (\<lambda>x. (X x, Y x)) = integral\<^isup>L (S \<Otimes>\<^isub>M T) ?f" |
|
1186 unfolding X Y XY |
|
1187 apply (subst integral_diff) |
|
1188 apply (intro integral_diff Ixy Ix Iy)+ |
|
1189 apply (subst integral_diff) |
|
1190 apply (intro integral_diff Ixy Ix Iy)+ |
|
1191 apply (simp add: field_simps) |
|
1192 done |
|
1193 also have "\<dots> = integral\<^isup>L (S \<Otimes>\<^isub>M T) ?g" |
|
1194 using `AE x in _. ?f x = ?g x` by (rule integral_cong_AE) |
|
1195 also have "\<dots> = mutual_information b S T X Y" |
|
1196 by (rule mutual_information_distr[OF S T Px Py Pxy, symmetric]) |
|
1197 finally show ?thesis .. |
|
1198 qed |
|
1199 |
|
1200 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy: |
|
1201 assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y" |
|
1202 shows "\<I>(X ; Y) = \<H>(X) - \<H>(X | Y)" |
|
1203 proof - |
|
1204 have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))" |
|
1205 using sf_X by (rule simple_distributedI) auto |
|
1206 have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))" |
|
1207 using sf_Y by (rule simple_distributedI) auto |
|
1208 have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))" |
|
1209 using sf_X sf_Y by (rule simple_function_Pair) |
|
1210 then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))" |
|
1211 by (rule simple_distributedI) auto |
|
1212 from simple_distributed_joint_finite[OF this, simp] |
|
1213 have eq: "count_space (X ` space M) \<Otimes>\<^isub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)" |
|
1214 by (simp add: pair_measure_count_space) |
|
1215 |
|
1216 have "\<I>(X ; Y) = \<H>(X) + \<H>(Y) - entropy b (count_space (X`space M) \<Otimes>\<^isub>M count_space (Y`space M)) (\<lambda>x. (X x, Y x))" |
|
1217 using sigma_finite_measure_count_space_finite sigma_finite_measure_count_space_finite simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY] |
|
1218 by (rule mutual_information_eq_entropy_conditional_entropy_distr) (auto simp: eq integrable_count_space) |
|
1219 then show ?thesis |
|
1220 unfolding conditional_entropy_def by simp |
|
1221 qed |
|
1222 |
|
1223 lemma (in information_space) mutual_information_nonneg_simple: |
|
1224 assumes sf_X: "simple_function M X" and sf_Y: "simple_function M Y" |
|
1225 shows "0 \<le> \<I>(X ; Y)" |
|
1226 proof - |
|
1227 have X: "simple_distributed M X (\<lambda>x. measure M (X -` {x} \<inter> space M))" |
|
1228 using sf_X by (rule simple_distributedI) auto |
|
1229 have Y: "simple_distributed M Y (\<lambda>x. measure M (Y -` {x} \<inter> space M))" |
|
1230 using sf_Y by (rule simple_distributedI) auto |
|
1231 |
|
1232 have sf_XY: "simple_function M (\<lambda>x. (X x, Y x))" |
|
1233 using sf_X sf_Y by (rule simple_function_Pair) |
|
1234 then have XY: "simple_distributed M (\<lambda>x. (X x, Y x)) (\<lambda>x. measure M ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M))" |
|
1235 by (rule simple_distributedI) auto |
|
1236 |
|
1237 from simple_distributed_joint_finite[OF this, simp] |
|
1238 have eq: "count_space (X ` space M) \<Otimes>\<^isub>M count_space (Y ` space M) = count_space (X ` space M \<times> Y ` space M)" |
|
1239 by (simp add: pair_measure_count_space) |
|
1240 |
1144 show ?thesis |
1241 show ?thesis |
1145 unfolding conditional_entropy_def |
1242 by (rule mutual_information_nonneg[OF _ _ simple_distributed[OF X] simple_distributed[OF Y] simple_distributed_joint[OF XY]]) |
1146 unfolding conditional_mutual_information_generic_eq[OF MX MX MZ] |
1243 (simp_all add: eq integrable_count_space sigma_finite_measure_count_space_finite) |
1147 by (auto simp: setsum_cartesian_product' setsum_negf[symmetric] |
|
1148 setsum_commute[of _ "space MZ"] * |
|
1149 intro!: setsum_cong) |
|
1150 qed |
|
1151 |
|
1152 lemma (in information_space) conditional_entropy_eq: |
|
1153 assumes "simple_function M X" "simple_function M Z" |
|
1154 shows "\<H>(X | Z) = |
|
1155 - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M. |
|
1156 joint_distribution X Z {(x, z)} * |
|
1157 log b (joint_distribution X Z {(x, z)} / distribution Z {z}))" |
|
1158 by (subst conditional_entropy_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]) |
|
1159 simp |
|
1160 |
|
1161 lemma (in information_space) conditional_entropy_eq_ce_with_hypothesis: |
|
1162 assumes X: "simple_function M X" and Y: "simple_function M Y" |
|
1163 shows "\<H>(X | Y) = |
|
1164 -(\<Sum>y\<in>Y`space M. distribution Y {y} * |
|
1165 (\<Sum>x\<in>X`space M. joint_distribution X Y {(x,y)} / distribution Y {(y)} * |
|
1166 log b (joint_distribution X Y {(x,y)} / distribution Y {(y)})))" |
|
1167 unfolding conditional_entropy_eq[OF assms] |
|
1168 using finite_distribution_order(5,6)[OF assms[THEN simple_function_imp_finite_random_variable]] |
|
1169 by (auto simp: setsum_cartesian_product' setsum_commute[of _ "Y`space M"] setsum_right_distrib |
|
1170 intro!: setsum_cong) |
|
1171 |
|
1172 lemma (in information_space) conditional_entropy_eq_cartesian_product: |
|
1173 assumes "simple_function M X" "simple_function M Y" |
|
1174 shows "\<H>(X | Y) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M. |
|
1175 joint_distribution X Y {(x,y)} * |
|
1176 log b (joint_distribution X Y {(x,y)} / distribution Y {y}))" |
|
1177 unfolding conditional_entropy_eq[OF assms] |
|
1178 by (auto intro!: setsum_cong simp: setsum_cartesian_product') |
|
1179 |
|
1180 subsection {* Equalities *} |
|
1181 |
|
1182 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy: |
|
1183 assumes X: "simple_function M X" and Z: "simple_function M Z" |
|
1184 shows "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" |
|
1185 proof - |
|
1186 let ?XZ = "\<lambda>x z. joint_distribution X Z {(x, z)}" |
|
1187 let ?Z = "\<lambda>z. distribution Z {z}" |
|
1188 let ?X = "\<lambda>x. distribution X {x}" |
|
1189 note fX = X[THEN simple_function_imp_finite_random_variable] |
|
1190 note fZ = Z[THEN simple_function_imp_finite_random_variable] |
|
1191 note finite_distribution_order[OF fX fZ, simp] |
|
1192 { fix x z assume "x \<in> X`space M" "z \<in> Z`space M" |
|
1193 have "?XZ x z * log b (?XZ x z / (?X x * ?Z z)) = |
|
1194 ?XZ x z * log b (?XZ x z / ?Z z) - ?XZ x z * log b (?X x)" |
|
1195 by (auto simp: log_simps zero_le_mult_iff field_simps less_le) } |
|
1196 note * = this |
|
1197 show ?thesis |
|
1198 unfolding entropy_eq[OF X] conditional_entropy_eq[OF X Z] mutual_information_eq[OF X Z] |
|
1199 using setsum_joint_distribution_singleton[OF fZ fX, unfolded joint_distribution_commute_singleton[of Z X]] |
|
1200 by (simp add: * setsum_cartesian_product' setsum_subtractf setsum_left_distrib[symmetric] |
|
1201 setsum_distribution) |
|
1202 qed |
1244 qed |
1203 |
1245 |
1204 lemma (in information_space) conditional_entropy_less_eq_entropy: |
1246 lemma (in information_space) conditional_entropy_less_eq_entropy: |
1205 assumes X: "simple_function M X" and Z: "simple_function M Z" |
1247 assumes X: "simple_function M X" and Z: "simple_function M Z" |
1206 shows "\<H>(X | Z) \<le> \<H>(X)" |
1248 shows "\<H>(X | Z) \<le> \<H>(X)" |
1207 proof - |
1249 proof - |
1208 have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] . |
1250 have "0 \<le> \<I>(X ; Z)" using X Z by (rule mutual_information_nonneg_simple) |
1209 with mutual_information_positive[OF X Z] entropy_positive[OF X] |
1251 also have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] . |
1210 show ?thesis by auto |
1252 finally show ?thesis by auto |
1211 qed |
1253 qed |
1212 |
1254 |
1213 lemma (in information_space) entropy_chain_rule: |
1255 lemma (in information_space) entropy_chain_rule: |
1214 assumes X: "simple_function M X" and Y: "simple_function M Y" |
1256 assumes X: "simple_function M X" and Y: "simple_function M Y" |
1215 shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)" |
1257 shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)" |
1216 proof - |
1258 proof - |
1217 let ?XY = "\<lambda>x y. joint_distribution X Y {(x, y)}" |
1259 note XY = simple_distributedI[OF simple_function_Pair[OF X Y] refl] |
1218 let ?Y = "\<lambda>y. distribution Y {y}" |
1260 note YX = simple_distributedI[OF simple_function_Pair[OF Y X] refl] |
1219 let ?X = "\<lambda>x. distribution X {x}" |
1261 note simple_distributed_joint_finite[OF this, simp] |
1220 note fX = X[THEN simple_function_imp_finite_random_variable] |
1262 let ?f = "\<lambda>x. prob ((\<lambda>x. (X x, Y x)) -` {x} \<inter> space M)" |
1221 note fY = Y[THEN simple_function_imp_finite_random_variable] |
1263 let ?g = "\<lambda>x. prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M)" |
1222 note finite_distribution_order[OF fX fY, simp] |
1264 let ?h = "\<lambda>x. if x \<in> (\<lambda>x. (Y x, X x)) ` space M then prob ((\<lambda>x. (Y x, X x)) -` {x} \<inter> space M) else 0" |
1223 { fix x y assume "x \<in> X`space M" "y \<in> Y`space M" |
1265 have "\<H>(\<lambda>x. (X x, Y x)) = - (\<Sum>x\<in>(\<lambda>x. (X x, Y x)) ` space M. ?f x * log b (?f x))" |
1224 have "?XY x y * log b (?XY x y / ?X x) = |
1266 using XY by (rule entropy_simple_distributed) |
1225 ?XY x y * log b (?XY x y) - ?XY x y * log b (?X x)" |
1267 also have "\<dots> = - (\<Sum>x\<in>(\<lambda>(x, y). (y, x)) ` (\<lambda>x. (X x, Y x)) ` space M. ?g x * log b (?g x))" |
1226 by (auto simp: log_simps zero_le_mult_iff field_simps less_le) } |
1268 by (subst (2) setsum_reindex) (auto simp: inj_on_def intro!: setsum_cong arg_cong[where f="\<lambda>A. prob A * log b (prob A)"]) |
1227 note * = this |
1269 also have "\<dots> = - (\<Sum>x\<in>(\<lambda>x. (Y x, X x)) ` space M. ?h x * log b (?h x))" |
|
1270 by (auto intro!: setsum_cong) |
|
1271 also have "\<dots> = entropy b (count_space (Y ` space M) \<Otimes>\<^isub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))" |
|
1272 by (subst entropy_distr[OF _ simple_distributed_joint[OF YX]]) |
|
1273 (auto simp: pair_measure_count_space sigma_finite_measure_count_space_finite lebesgue_integral_count_space_finite |
|
1274 cong del: setsum_cong intro!: setsum_mono_zero_left) |
|
1275 finally have "\<H>(\<lambda>x. (X x, Y x)) = entropy b (count_space (Y ` space M) \<Otimes>\<^isub>M count_space (X ` space M)) (\<lambda>x. (Y x, X x))" . |
|
1276 then show ?thesis |
|
1277 unfolding conditional_entropy_def by simp |
|
1278 qed |
|
1279 |
|
1280 lemma (in information_space) entropy_partition: |
|
1281 assumes X: "simple_function M X" |
|
1282 shows "\<H>(X) = \<H>(f \<circ> X) + \<H>(X|f \<circ> X)" |
|
1283 proof - |
|
1284 note fX = simple_function_compose[OF X, of f] |
|
1285 have eq: "(\<lambda>x. ((f \<circ> X) x, X x)) ` space M = (\<lambda>x. (f x, x)) ` X ` space M" by auto |
|
1286 have inj: "\<And>A. inj_on (\<lambda>x. (f x, x)) A" |
|
1287 by (auto simp: inj_on_def) |
1228 show ?thesis |
1288 show ?thesis |
1229 using setsum_joint_distribution_singleton[OF fY fX] |
1289 apply (subst entropy_chain_rule[symmetric, OF fX X]) |
1230 unfolding entropy_eq[OF X] conditional_entropy_eq_cartesian_product[OF Y X] entropy_eq_cartesian_product[OF X Y] |
1290 apply (subst entropy_simple_distributed[OF simple_distributedI[OF simple_function_Pair[OF fX X] refl]]) |
1231 unfolding joint_distribution_commute_singleton[of Y X] setsum_commute[of _ "X`space M"] |
1291 apply (subst entropy_simple_distributed[OF simple_distributedI[OF X refl]]) |
1232 by (simp add: * setsum_subtractf setsum_left_distrib[symmetric]) |
1292 unfolding eq |
1233 qed |
1293 apply (subst setsum_reindex[OF inj]) |
1234 |
1294 apply (auto intro!: setsum_cong arg_cong[where f="\<lambda>A. prob A * log b (prob A)"]) |
1235 section {* Partitioning *} |
1295 done |
1236 |
|
1237 definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)" |
|
1238 |
|
1239 lemma subvimageI: |
|
1240 assumes "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y" |
|
1241 shows "subvimage A f g" |
|
1242 using assms unfolding subvimage_def by blast |
|
1243 |
|
1244 lemma subvimageE[consumes 1]: |
|
1245 assumes "subvimage A f g" |
|
1246 obtains "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y" |
|
1247 using assms unfolding subvimage_def by blast |
|
1248 |
|
1249 lemma subvimageD: |
|
1250 "\<lbrakk> subvimage A f g ; x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y" |
|
1251 using assms unfolding subvimage_def by blast |
|
1252 |
|
1253 lemma subvimage_subset: |
|
1254 "\<lbrakk> subvimage B f g ; A \<subseteq> B \<rbrakk> \<Longrightarrow> subvimage A f g" |
|
1255 unfolding subvimage_def by auto |
|
1256 |
|
1257 lemma subvimage_idem[intro]: "subvimage A g g" |
|
1258 by (safe intro!: subvimageI) |
|
1259 |
|
1260 lemma subvimage_comp_finer[intro]: |
|
1261 assumes svi: "subvimage A g h" |
|
1262 shows "subvimage A g (f \<circ> h)" |
|
1263 proof (rule subvimageI, simp) |
|
1264 fix x y assume "x \<in> A" "y \<in> A" "g x = g y" |
|
1265 from svi[THEN subvimageD, OF this] |
|
1266 show "f (h x) = f (h y)" by simp |
|
1267 qed |
|
1268 |
|
1269 lemma subvimage_comp_gran: |
|
1270 assumes svi: "subvimage A g h" |
|
1271 assumes inj: "inj_on f (g ` A)" |
|
1272 shows "subvimage A (f \<circ> g) h" |
|
1273 by (rule subvimageI) (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj]) |
|
1274 |
|
1275 lemma subvimage_comp: |
|
1276 assumes svi: "subvimage (f ` A) g h" |
|
1277 shows "subvimage A (g \<circ> f) (h \<circ> f)" |
|
1278 by (rule subvimageI) (auto intro!: svi[THEN subvimageD]) |
|
1279 |
|
1280 lemma subvimage_trans: |
|
1281 assumes fg: "subvimage A f g" |
|
1282 assumes gh: "subvimage A g h" |
|
1283 shows "subvimage A f h" |
|
1284 by (rule subvimageI) (auto intro!: fg[THEN subvimageD] gh[THEN subvimageD]) |
|
1285 |
|
1286 lemma subvimage_translator: |
|
1287 assumes svi: "subvimage A f g" |
|
1288 shows "\<exists>h. \<forall>x \<in> A. h (f x) = g x" |
|
1289 proof (safe intro!: exI[of _ "\<lambda>x. (THE z. z \<in> (g ` (f -` {x} \<inter> A)))"]) |
|
1290 fix x assume "x \<in> A" |
|
1291 show "(THE x'. x' \<in> (g ` (f -` {f x} \<inter> A))) = g x" |
|
1292 by (rule theI2[of _ "g x"]) |
|
1293 (insert `x \<in> A`, auto intro!: svi[THEN subvimageD]) |
|
1294 qed |
|
1295 |
|
1296 lemma subvimage_translator_image: |
|
1297 assumes svi: "subvimage A f g" |
|
1298 shows "\<exists>h. h ` f ` A = g ` A" |
|
1299 proof - |
|
1300 from subvimage_translator[OF svi] |
|
1301 obtain h where "\<And>x. x \<in> A \<Longrightarrow> h (f x) = g x" by auto |
|
1302 thus ?thesis |
|
1303 by (auto intro!: exI[of _ h] |
|
1304 simp: image_compose[symmetric] comp_def cong: image_cong) |
|
1305 qed |
|
1306 |
|
1307 lemma subvimage_finite: |
|
1308 assumes svi: "subvimage A f g" and fin: "finite (f`A)" |
|
1309 shows "finite (g`A)" |
|
1310 proof - |
|
1311 from subvimage_translator_image[OF svi] |
|
1312 obtain h where "g`A = h`f`A" by fastforce |
|
1313 with fin show "finite (g`A)" by simp |
|
1314 qed |
|
1315 |
|
1316 lemma subvimage_disj: |
|
1317 assumes svi: "subvimage A f g" |
|
1318 shows "f -` {x} \<inter> A \<subseteq> g -` {y} \<inter> A \<or> |
|
1319 f -` {x} \<inter> g -` {y} \<inter> A = {}" (is "?sub \<or> ?dist") |
|
1320 proof (rule disjCI) |
|
1321 assume "\<not> ?dist" |
|
1322 then obtain z where "z \<in> A" and "x = f z" and "y = g z" by auto |
|
1323 thus "?sub" using svi unfolding subvimage_def by auto |
|
1324 qed |
|
1325 |
|
1326 lemma setsum_image_split: |
|
1327 assumes svi: "subvimage A f g" and fin: "finite (f ` A)" |
|
1328 shows "(\<Sum>x\<in>f`A. h x) = (\<Sum>y\<in>g`A. \<Sum>x\<in>f`(g -` {y} \<inter> A). h x)" |
|
1329 (is "?lhs = ?rhs") |
|
1330 proof - |
|
1331 have "f ` A = |
|
1332 snd ` (SIGMA x : g ` A. f ` (g -` {x} \<inter> A))" |
|
1333 (is "_ = snd ` ?SIGMA") |
|
1334 unfolding image_split_eq_Sigma[symmetric] |
|
1335 by (simp add: image_compose[symmetric] comp_def) |
|
1336 moreover |
|
1337 have snd_inj: "inj_on snd ?SIGMA" |
|
1338 unfolding image_split_eq_Sigma[symmetric] |
|
1339 by (auto intro!: inj_onI subvimageD[OF svi]) |
|
1340 ultimately |
|
1341 have "(\<Sum>x\<in>f`A. h x) = (\<Sum>(x,y)\<in>?SIGMA. h y)" |
|
1342 by (auto simp: setsum_reindex intro: setsum_cong) |
|
1343 also have "... = ?rhs" |
|
1344 using subvimage_finite[OF svi fin] fin |
|
1345 apply (subst setsum_Sigma[symmetric]) |
|
1346 by (auto intro!: finite_subset[of _ "f`A"]) |
|
1347 finally show ?thesis . |
|
1348 qed |
|
1349 |
|
1350 lemma (in information_space) entropy_partition: |
|
1351 assumes sf: "simple_function M X" "simple_function M P" |
|
1352 assumes svi: "subvimage (space M) X P" |
|
1353 shows "\<H>(X) = \<H>(P) + \<H>(X|P)" |
|
1354 proof - |
|
1355 let ?XP = "\<lambda>x p. joint_distribution X P {(x, p)}" |
|
1356 let ?X = "\<lambda>x. distribution X {x}" |
|
1357 let ?P = "\<lambda>p. distribution P {p}" |
|
1358 note fX = sf(1)[THEN simple_function_imp_finite_random_variable] |
|
1359 note fP = sf(2)[THEN simple_function_imp_finite_random_variable] |
|
1360 note finite_distribution_order[OF fX fP, simp] |
|
1361 have "(\<Sum>x\<in>X ` space M. ?X x * log b (?X x)) = |
|
1362 (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M. ?XP x y * log b (?XP x y))" |
|
1363 proof (subst setsum_image_split[OF svi], |
|
1364 safe intro!: setsum_mono_zero_cong_left imageI) |
|
1365 show "finite (X ` space M)" "finite (X ` space M)" "finite (P ` space M)" |
|
1366 using sf unfolding simple_function_def by auto |
|
1367 next |
|
1368 fix p x assume in_space: "p \<in> space M" "x \<in> space M" |
|
1369 assume "?XP (X x) (P p) * log b (?XP (X x) (P p)) \<noteq> 0" |
|
1370 hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def) |
|
1371 with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`] |
|
1372 show "x \<in> P -` {P p}" by auto |
|
1373 next |
|
1374 fix p x assume in_space: "p \<in> space M" "x \<in> space M" |
|
1375 assume "P x = P p" |
|
1376 from this[symmetric] svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`] |
|
1377 have "X -` {X x} \<inter> space M \<subseteq> P -` {P p} \<inter> space M" |
|
1378 by auto |
|
1379 hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M = X -` {X x} \<inter> space M" |
|
1380 by auto |
|
1381 thus "?X (X x) * log b (?X (X x)) = ?XP (X x) (P p) * log b (?XP (X x) (P p))" |
|
1382 by (auto simp: distribution_def) |
|
1383 qed |
|
1384 moreover have "\<And>x y. ?XP x y * log b (?XP x y / ?P y) = |
|
1385 ?XP x y * log b (?XP x y) - ?XP x y * log b (?P y)" |
|
1386 by (auto simp add: log_simps zero_less_mult_iff field_simps) |
|
1387 ultimately show ?thesis |
|
1388 unfolding sf[THEN entropy_eq] conditional_entropy_eq[OF sf] |
|
1389 using setsum_joint_distribution_singleton[OF fX fP] |
|
1390 by (simp add: setsum_cartesian_product' setsum_subtractf setsum_distribution |
|
1391 setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"]) |
|
1392 qed |
1296 qed |
1393 |
1297 |
1394 corollary (in information_space) entropy_data_processing: |
1298 corollary (in information_space) entropy_data_processing: |
1395 assumes X: "simple_function M X" shows "\<H>(f \<circ> X) \<le> \<H>(X)" |
1299 assumes X: "simple_function M X" shows "\<H>(f \<circ> X) \<le> \<H>(X)" |
1396 proof - |
1300 proof - |
1397 note X |
1301 note fX = simple_function_compose[OF X, of f] |
1398 moreover have fX: "simple_function M (f \<circ> X)" using X by auto |
1302 from X have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition) |
1399 moreover have "subvimage (space M) X (f \<circ> X)" by auto |
|
1400 ultimately have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition) |
|
1401 then show "\<H>(f \<circ> X) \<le> \<H>(X)" |
1303 then show "\<H>(f \<circ> X) \<le> \<H>(X)" |
1402 by (auto intro: conditional_entropy_positive[OF X fX]) |
1304 by (auto intro: conditional_entropy_nonneg[OF X fX]) |
1403 qed |
1305 qed |
1404 |
1306 |
1405 corollary (in information_space) entropy_of_inj: |
1307 corollary (in information_space) entropy_of_inj: |
1406 assumes X: "simple_function M X" and inj: "inj_on f (X`space M)" |
1308 assumes X: "simple_function M X" and inj: "inj_on f (X`space M)" |
1407 shows "\<H>(f \<circ> X) = \<H>(X)" |
1309 shows "\<H>(f \<circ> X) = \<H>(X)" |