From 0d07bc1d383cdc5009a671855f3704f1b3322fc8 Mon Sep 17 00:00:00 2001 From: Vikas Rawal Date: Fri, 29 Nov 2019 09:18:34 +0530 Subject: [PATCH] distribution of sample mean with unknown population variance --- acjlecturesday2.org | 239 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 238 insertions(+), 1 deletion(-) diff --git a/acjlecturesday2.org b/acjlecturesday2.org index bd8f189..2df9d95 100644 --- a/acjlecturesday2.org +++ b/acjlecturesday2.org @@ -44,7 +44,9 @@ Source: [[https://www.nytimes.com/2018/11/02/opinion/the-perversion-of-fiscal-po #+END_SRC -** Sampling Distributions :slide: +** Sampling Distributions + +*** Sampling Distributions :slide: #+RESULTS: sampling2 [[file:bsample2.png]] @@ -142,4 +144,239 @@ Source: [[https://www.nytimes.com/2018/11/02/opinion/the-perversion-of-fiscal-po p #+end_src +*** Sampling Distributions :slide: + ++ $Standard.error = \frac{\sigma}{\sqrt{n}}$, where $n$ is the sample size + + + +| Standard deviation of population ($\sigma$) | 130 | +| Standard errors of samples of size | | +| 5 | 58 | +| 20 | 29 | +| 50 | 18 | +| 200 | 9 | + + + + +** Introduction to Hypothesis Testing +*** Transforming the Distribution to Standard Normal :slide: + +#+RESULTS: sampling3 +[[file:bsample3.png]] + +#+NAME: sampling3 +#+BEGIN_SRC R :results output graphics :exports results :file bsample3.png :width 2500 :height 2000 :res 300 + library(data.table) + readRDS("plfsdata/plfsacjdata.rds")->worker + worker$standardwage->worker$wage + c(1:nrow(worker))->worker$SamplingFrameOrder + worker[sex!=3,]->worker + library(ggplot2) + + worker->t9 + (t9$wage-mean(t9$wage))/sqrt(var(t9$wage))->t9$wage + ggplot(t9,aes(wage))+geom_density(colour="black",size=1)->p + p+scale_y_continuous(limits=c(0,0.75))->p + p+scale_x_continuous(limits=c(-15,15) + ,breaks=c(-5,0,mean(worker$wage),10,15))->p + p+theme_bw()->p + p + + + + sample(1:nrow(worker),5, replace=FALSE)->a1 
+    worker[a1,]->s1                          # rows of worker at the indices drawn into a1
+    c(t1,mean(s1$wage))->t1                  # append this draw's mean wage to the vector t1
+  }
+  # Standardise the collected means: (mean - mean of worker$wage) / sd of the collected means
+  data.frame(sno=c(1:10000),meancol=(t1-mean(worker$wage))/sqrt(var(t1)))->t1
+  p+geom_density(data=t1,aes(meancol),colour="blue",size=1)-> p   # blue layer: samples of 5 rows
+  p
+  # --- samples of 20 rows, means collected in t0 ---
+  sample(1:nrow(worker),20, replace=FALSE)->a1   # 20 row indices, drawn without replacement
+  worker[a1,]->s1
+  mean(s1$wage)->t0                              # first mean seeds the vector
+  for (i in c(1:9999)) {                         # 9999 further draws, 10000 means in total
+    sample(1:nrow(worker),20, replace=FALSE)->a1
+    worker[a1,]->s1
+    c(t0,mean(s1$wage))->t0                      # grows t0 by one element per iteration
+  }
+  # sno=1:10000 matches the 1 + 9999 means gathered above
+  data.frame(sno=c(1:10000),meancol=(t0-mean(worker$wage))/sqrt(var(t0)))->t0
+  p+geom_density(data=t0,aes(meancol),colour="darkolivegreen",size=1)-> p   # green layer: samples of 20
+  p
+  # --- samples of 50 rows, means collected in t ---
+  sample(1:nrow(worker),50, replace=FALSE)->a1
+  worker[a1,]->s1
+  mean(s1$wage)->t                               # NOTE(review): binds the name t, also base R's transpose function
+  for (i in c(1:9999)) {
+    sample(1:nrow(worker),50, replace=FALSE)->a1
+    worker[a1,]->s1
+    c(t,mean(s1$wage))->t
+  }
+  # Same standardisation as above, applied to the 50-row sample means
+  data.frame(sno=c(1:10000),meancol=(t-mean(worker$wage))/sqrt(var(t)))->t
+  p+geom_density(data=t,aes(meancol),colour="red",size=1)-> p   # red layer: samples of 50
+  p
+  # --- samples of 200 rows, means collected in t4 ---
+  sample(1:nrow(worker),200, replace=FALSE)->a1
+  worker[a1,]->s1
+  mean(s1$wage)->t4
+  for (i in c(1:9999)) {
+    sample(1:nrow(worker),200, replace=FALSE)->a1
+    worker[a1,]->s1
+    c(t4,mean(s1$wage))->t4
+  }
+  # Same standardisation as above, applied to the 200-row sample means
+  data.frame(sno=c(1:10000),meancol=(t4-mean(worker$wage))/sqrt(var(t4)))->t4
+  p+geom_density(data=t4,aes(meancol),colour="pink",size=1)-> p   # pink layer: samples of 200
+  p                                              # final plot with all layers added so far
+#+end_src
+
+
+*** But in real situations we do not know the population variance!   :slide:
+
+#+RESULTS: sampling5
+[[file:bsample5.png]]
+
+#+NAME: sampling5
+#+BEGIN_SRC R :results output graphics :exports results :file bsample5.png :width 3500 :height 2000 :res 300
+  library(data.table)
+  library(ggplot2)
+  options(scipen=9999)                           # strongly prefer fixed over scientific notation when printing
+  readRDS("plfsdata/plfsacjdata.rds")->worker    # load the saved worker data set
+  worker$standardwage->worker$wage               # copy standardwage into the column wage used below
+  c(1:nrow(worker))->worker$SamplingFrameOrder   # running row number
+  worker[sex!=3,]->worker                        # keep rows with sex != 3; presumably worker is a data.table (sex is unquoted) - confirm
+  # Standardised copy of the wage column (z-scores), plotted as the black density
+  worker->t9
+  (t9$wage-mean(t9$wage))/sqrt(var(t9$wage))->t9$wage
+  ggplot(t9,aes(wage))+geom_density(colour="black",size=1)->p
+  p+scale_y_continuous(limits=c(0,0.75))->p
+  p+scale_x_continuous(limits=c(-15,15)
+                      ,breaks=c(-15,0,round(mean(worker$wage)),15))->p
+  p+theme_bw()->p
+  p                                              # p is rebuilt from an empty ggplot() further down
+  # Simulation: 20000 samples of samplesize rows; one result row is appended to t4 per sample
+  data.frame(sno=c(),meancol=c(),sterr=c())->t4  # empty data frame to accumulate into
+  samplesize=10
+  for (i in c(1:20000)) {
+    sample(1:nrow(worker),samplesize, replace=FALSE)->a1
+    worker[a1,]->s1
+    rbind(t4,data.frame(
+                 sno=i,
+                 meancol=mean(s1$wage),                       # sample mean
+                 sterr=sqrt(var(s1$wage))/sqrt(samplesize)    # sample sd / sqrt(n): estimated standard error
+               )
+          )->t4
+  }
+  # teststat divides each mean by that sample's own estimated standard error;
+  (t4$meancol)/t4$sterr->t4$teststat
+  (t4$meancol)/sqrt(var(t4$meancol))->t4$teststat2   # teststat2 divides by the sd of all simulated means
+  data.frame(modelt=rt(200000,samplesize-1,ncp=mean(t4$teststat)),modelnorm=rnorm(200000,mean=mean(t4$teststat2)))->m   # reference draws: t with n-1 df and ncp at the mean of teststat; normal (sd 1) centred at the mean of teststat2
+  # Variances and means of the simulated and reference columns, evaluated at top level (not stored)
+  var(t4$teststat)
+  var(m$modelt)
+  var(m$modelnorm)
+  var(t4$teststat2)
+  mean(t4$teststat)
+  mean(m$modelt)
+  mean(m$modelnorm)
+  mean(t4$teststat2)
+  # Fresh plot: the two simulated statistics and the two reference distributions as density curves
+  ggplot()->p
+  p+geom_density(data=t4,aes(teststat2),colour="red",size=1)-> p
+  p+geom_density(data=m,aes(modelnorm),colour="black",size=1)->p
+  p+geom_density(data=t4,aes(teststat),colour="blue",size=1)-> p
+  p+geom_density(data=m,aes(modelt),colour="darkolivegreen",size=1)->p
+  p+annotate("text",x=-30,y=0.42,                # text labels are left-aligned (hjust=0) at x=-30
+             label=paste("Normal distribution, with standard deviation",round(sqrt(var(m$modelnorm)),2)),
+             colour="black",hjust=0)->p
+  p+annotate("text",x=-30,y=0.40,
+             label=paste("Statistic with known population variance, standard error =",
+                         round(sqrt(var(t4$teststat2)),2)),
+             colour="red",hjust=0)->p
+  p+annotate("text",x=-30,y=0.38,
+             label=paste("t distribution, with standard deviation =",round(sqrt(var(m$modelt)),2)),
+             colour="darkolivegreen",hjust=0)->p
+  p+annotate("text",x=-30,y=0.36,
+             label=paste("Statistic with unknown population variance, standard error =",
+                         round(sqrt(var(t4$teststat)),2)),
+             colour="blue",hjust=0)->p
+  p+scale_x_continuous(limits=c(-30,30))+theme_bw()->p   # x axis limited to [-30, 30]
+  p
+#+end_src
+
+
+*** Introduction to the t distribution                              :ignore:
+
+#+RESULTS: sampling4
+[[file:bsample4.png]]
+
+#+NAME: sampling4
+#+BEGIN_SRC R :results output graphics :exports results :file bsample4.png :width 2500 :height 2000 :res 300
+  library(data.table)
+  library(ggplot2)
+  options(scipen=9999)                           # strongly prefer fixed over scientific notation when printing
+  readRDS("plfsdata/plfsacjdata.rds")->worker    # load the saved worker data set
+  worker$standardwage->worker$wage               # copy standardwage into the column wage used below
+  c(1:nrow(worker))->worker$SamplingFrameOrder   # running row number
+  worker[sex!=3,]->worker                        # keep rows with sex != 3; presumably worker is a data.table (sex is unquoted) - confirm
+  # Standardised copy of the wage column (z-scores), plotted as the black density
+  worker->t9
+  (t9$wage-mean(t9$wage))/sqrt(var(t9$wage))->t9$wage
+  ggplot(t9,aes(wage))+geom_density(colour="black",size=1)->p
+  p+scale_y_continuous(limits=c(0,0.75))->p
+  p+scale_x_continuous(limits=c(-15,15)
+                      ,breaks=c(-15,0,round(mean(worker$wage)),15))->p
+  p+theme_bw()->p
+  p                                              # p is rebuilt from an empty ggplot() further down
+  # Simulation: 20000 samples of samplesize rows; one result row is appended to t4 per sample
+  data.frame(sno=c(),meancol=c(),sterr=c())->t4  # empty data frame to accumulate into
+  samplesize=50
+  for (i in c(1:20000)) {
+    sample(1:nrow(worker),samplesize, replace=FALSE)->a1
+    worker[a1,]->s1
+    rbind(t4,data.frame(
+                 sno=i,
+                 meancol=mean(s1$wage),                          # sample mean
+                 sterr=sqrt(var(s1$wage))/sqrt(samplesize)))->t4 # sample sd / sqrt(n): estimated standard error
+  }
+  # Centre the means on their overall mean, then scale by each sample's own standard error (teststat)
+  (t4$meancol-mean(t4$meancol))/t4$sterr->t4$teststat
+  (t4$meancol-mean(t4$meancol))/sqrt(var(t4$meancol))->t4$teststat2   # ...or by the sd of all simulated means
+  data.frame(modelt=rt(20000,samplesize-1))->m   # reference t draws with n-1 df; was hard-coded to 29, which did not match samplesize=50
+  # Variances of the two statistics and of the reference draws, evaluated at top level (not stored)
+  var(t4$teststat)
+  var(m$modelt)
+  var(t4$teststat2)
+  # Fresh plot: both simulated statistics and the reference t distribution as density curves
+  ggplot()->p
+  p+geom_density(data=t4,aes(teststat),colour="blue",size=1)-> p
+  p+geom_density(data=m,aes(modelt),colour="darkolivegreen",size=1)->p
+  p+geom_density(data=t4,aes(teststat2),colour="red",size=1)-> p
+  p+annotate("text",x=3,y=0.4,                   # variance labels, coloured like their curves
+             label=paste("Var of statistic with unknown variance:",
+                         round(var(t4$teststat),2)),
+             colour="blue")->p
+  p+annotate("text",x=3,y=0.39,
+             label=paste("Var of statistic with known variance:",
+                         round(var(t4$teststat2),2)),
+             colour="red")->p
+  p+annotate("text",x=3,y=0.38,
+             label=paste("Var of t-distribution:",round(var(m$modelt),2)),
+             colour="darkolivegreen")->p
+  p
+
+
+#+end_src
+
+
+