diff --git a/.gitignore b/.gitignore index 768990e..e22a10e 100644 --- a/.gitignore +++ b/.gitignore @@ -3,6 +3,7 @@ !acjlectures.org !acjlecturesday1.org !acjlecturesday2.org +!associations.org !.gitignore !.gitattributes !graphics diff --git a/README.md b/README.md index 9492ccd..d29c655 100644 --- a/README.md +++ b/README.md @@ -3,4 +3,5 @@ ## [Introduction](../master/index.org) ## [Descriptive Statistics](../master/descriptive-statistics.org) ## [Introduction to Statistical Inference](../master/statistical-inference.org) +## [Correlation and Regressions](../master/associations.org) diff --git a/associations.org b/associations.org new file mode 100644 index 0000000..8a41522 --- /dev/null +++ b/associations.org @@ -0,0 +1,233 @@ +#+TITLE: Correlation and Regressions +#+PROPERTY: header-args:R :session acj :eval never-export +#+STARTUP: hideall inlineimages hideblocks +#+HTML_HEAD: + + +#+NAME: sassociation1 +#+BEGIN_SRC R :results output list org :exports results + library(data.table) + readRDS("plfsdata/plfsacjdata.rds")->worker + worker$standardwage->worker$wage + factor(worker$social_group)->worker$social_group + factor(worker$religion)->worker$religion + factor(as.numeric(worker$state))->worker$state + factor(worker$sector)->worker$sector + + cor.test(worker$wage,worker$years_edu) + cor.test(worker$wage,worker$age) +#+end_src + +#+RESULTS: sassociation1 +#+begin_src org +- Pearson's product-moment correlation +- data: worker$wage and worker$years_edu +- t = 35.998, df = 37633, p-value < 0.00000000000000022 +- alternative hypothesis: true correlation is not equal to 0 +- 95 percent confidence interval: +- 0.1726625 0.1921962 +- sample estimates: +- cor +- 0.1824473 +- Pearson's product-moment correlation +- data: worker$wage and worker$age +- t = 9.3777, df = 37633, p-value < 0.00000000000000022 +- alternative hypothesis: true correlation is not equal to 0 +- 95 percent confidence interval: +- 0.03819950 0.05835859 +- sample estimates: +- cor +- 0.04828396 +#+end_src + + +#+NAME: sassociation2 +#+BEGIN_SRC R :results output list org :exports results + lm(wage~sex+age+years_edu, + data=worker)->t + summary(t) + + lm(wage~sex+age+years_edu+sector+social_group+religion+quarter, + data=worker)->t + summary(t) + + lm(wage~sex+age+years_edu+ + sector+social_group+religion+quarter+state, + data=worker)->t + summary(t) + +#+end_src + +#+RESULTS: sassociation2 +#+begin_src org +- Call: +- lm(formula = wage ~ sex +- age +- years_edu, data = worker) +- Residuals: +- Min 1Q Median 3Q Max +- -1638.7 -489.5 -72.1 437.6 12305.1 +- Coefficients: +- Estimate Std. Error t value Pr(>|t|) +- (Intercept) 2185.0021 19.6473 111.211 < 0.0000000000000002 *** +- sex -667.9011 9.5525 -69.919 < 0.0000000000000002 *** +- age 1.9781 0.3152 6.276 0.000000000352 *** +- years_edu 10.7387 0.8999 11.933 < 0.0000000000000002 *** +- --- +- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +- Residual standard error: 695.3 on 37631 degrees of freedom +- Multiple R-squared: 0.1343, Adjusted R-squared: 0.1342 +- F-statistic: 1946 on 3 and 37631 DF, p-value: < 0.00000000000000022 +- Call: +- lm(formula = wage ~ sex +- age +- years_edu +- sector +- social_group +- religion +- quarter, data = worker) +- Residuals: +- Min 1Q Median 3Q Max +- -1789.3 -484.2 -60.3 432.2 12337.4 +- Coefficients: +- Estimate Std. Error t value Pr(>|t|) +- (Intercept) 1716.2498 23.9940 71.528 < 0.0000000000000002 *** +- sex -621.1712 9.4569 -65.684 < 0.0000000000000002 *** +- age 1.5207 0.3096 4.912 0.00000090742386603 *** +- years_edu 8.0624 0.8860 9.100 < 0.0000000000000002 *** +- sector 211.8919 8.0576 26.297 < 0.0000000000000002 *** +- social_group 15.1192 1.3090 11.550 < 0.0000000000000002 *** +- religion 24.2759 3.0235 8.029 0.00000000000000101 *** +- quarterQ2 33.0336 11.4750 2.879 0.00399 ** +- quarterQ3 42.1100 10.9479 3.846 0.00012 *** +- quarterQ4 60.3340 10.7247 5.626 0.00000001860438745 *** +- quarterQ5 96.8388 13.8317 7.001 0.00000000000257946 *** +- --- +- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +- Residual standard error: 682.3 on 37624 degrees of freedom +- Multiple R-squared: 0.1665, Adjusted R-squared: 0.1662 +- F-statistic: 751.4 on 10 and 37624 DF, p-value: < 0.00000000000000022 +- Call: +- lm(formula = wage ~ sex +- age +- years_edu +- sector +- social_group +- religion +- quarter +- state, data = worker) +- Residuals: +- Min 1Q Median 3Q Max +- -2166.6 -420.9 -26.9 393.6 11885.4 +- Coefficients: +- Estimate Std. Error t value Pr(>|t|) +- (Intercept) 2334.9216 31.6980 73.661 < 0.0000000000000002 *** +- sex -600.1791 8.9602 -66.983 < 0.0000000000000002 *** +- age -0.7958 0.2925 -2.720 0.006522 ** +- years_edu 3.5156 0.8463 4.154 0.0000327426267356 *** +- sector 170.5218 7.7194 22.090 < 0.0000000000000002 *** +- social_group 11.0980 1.2958 8.565 < 0.0000000000000002 *** +- religion 6.8266 3.0199 2.261 0.023793 * +- quarterQ2 27.5969 10.5232 2.622 0.008733 ** +- quarterQ3 47.4106 10.0603 4.713 0.0000024541348868 *** +- quarterQ4 65.0550 9.8696 6.591 0.0000000000441218 *** +- quarterQ5 111.5467 12.9159 8.636 < 0.0000000000000002 *** +- state02 -182.7089 36.7495 -4.972 0.0000006664834133 *** +- state03 -302.4308 31.4091 -9.629 < 0.0000000000000002 *** +- state04 54.6670 157.9356 0.346 0.729243 +- state05 -364.2626 49.7310 -7.325 0.0000000000002443 *** +- state06 -158.2256 34.1364 -4.635 0.0000035796630662 *** +- state07 103.8315 85.9533 1.208 0.227056 +- state08 -232.7281 31.6792 -7.346 0.0000000000002077 *** +- state09 -625.4561 26.6275 -23.489 < 0.0000000000000002 *** +- state1 -329.4467 43.8025 -7.521 0.0000000000000555 *** +- state10 -228.9421 27.2168 -8.412 < 0.0000000000000002 *** +- state11 229.4834 90.4784 2.536 0.011206 * +- state12 8.6292 53.5113 0.161 0.871889 +- state13 -414.2133 108.2687 -3.826 0.000131 *** +- state14 24.9990 33.2053 0.753 0.451537 +- state15 -386.4679 38.6430 -10.001 < 0.0000000000000002 *** +- state16 -408.9313 35.7746 -11.431 < 0.0000000000000002 *** +- state17 -169.2716 39.6051 -4.274 0.0000192485172076 *** +- state18 -191.4328 31.9025 -6.001 0.0000000019842884 *** +- state19 -849.3718 25.4194 -33.414 < 0.0000000000000002 *** +- state2 -127.5145 72.9047 -1.749 0.080289 . +- state20 -627.1074 29.7862 -21.054 < 0.0000000000000002 *** +- state21 -739.1066 28.7611 -25.698 < 0.0000000000000002 *** +- state22 -873.9305 30.4180 -28.731 < 0.0000000000000002 *** +- state23 -803.9339 26.7202 -30.087 < 0.0000000000000002 *** +- state24 -541.8985 29.4421 -18.406 < 0.0000000000000002 *** +- state25 57.8958 313.5190 0.185 0.853493 +- state26 -211.9977 199.1455 -1.065 0.287093 +- state27 -594.3999 24.9000 -23.872 < 0.0000000000000002 *** +- state28 -381.2620 26.4571 -14.411 < 0.0000000000000002 *** +- state29 -367.1243 26.4633 -13.873 < 0.0000000000000002 *** +- state3 -479.1579 40.3655 -11.870 < 0.0000000000000002 *** +- state30 63.4353 74.2422 0.854 0.392869 +- state31 157.6998 89.1517 1.769 0.076920 . +- state32 38.5378 26.3576 1.462 0.143717 +- state33 -423.9537 25.3135 -16.748 < 0.0000000000000002 *** +- state34 -419.1475 40.5708 -10.331 < 0.0000000000000002 *** +- state35 416.7570 57.3598 7.266 0.0000000000003784 *** +- state36 -31.7199 29.3147 -1.082 0.279238 +- state4 371.0493 182.0114 2.039 0.041496 * +- state5 -599.6303 58.8906 -10.182 < 0.0000000000000002 *** +- state6 -197.8508 55.0537 -3.594 0.000326 *** +- state7 253.3238 116.6312 2.172 0.029861 * +- state8 -291.8271 42.4682 -6.872 0.0000000000064448 *** +- state9 -734.4680 34.6586 -21.192 < 0.0000000000000002 *** +- --- +- Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1 +- Residual standard error: 625.2 on 37580 degrees of freedom +- Multiple R-squared: 0.301, Adjusted R-squared: 0.3 +- F-statistic: 299.7 on 54 and 37580 DF, p-value: < 0.00000000000000022 +#+end_src + +#+NAME: sassociation3 +#+BEGIN_SRC R :results output graphics :exports results :file bsample2.png :width 2500 :height 1500 :res 300 + library(data.table) + readRDS("plfsdata/plfsacjdata.rds")->worker + worker$standardwage->worker$wage + factor(worker$social_group)->worker$social_group + factor(worker$religion)->worker$religion + factor(worker$state)->worker$state + factor(worker$sector)->worker$sector + worker->t9 + lm(wage~sex+age+years_edu+sector+social_group+religion+quarter+state,data=t9)->t + lm(log(wage)~sex+age+years_edu+sector+social_group+religion+quarter+state,data=t9)->t2 + data.frame(yvar=t9$wage,residuals=residuals(t),variable="model1")->a + rbind(a,data.frame(yvar=log(t9$wage),residuals=residuals(t2),variable="model2"))->a + ggplot(a,aes(x=residuals,y=yvar,group=variable))->p + p+geom_point()+facet_wrap(.~variable,scales="free") +#+end_src + +#+RESULTS: sassociation3 +[[file:bsample2.png]] + +#+NAME: roughwork +#+BEGIN_SRC R :results output list org :exports results + worker->t + t[,years_edu:=as.numeric(years_edu)] + t[years_edu==0,category:=3] + t[years_edu>0&years_edu<12,category:=2] + t[is.na(category),category:=1] + + + ifelse(t$years_edu==0,1, + ifelse(t$years_edu<12,2,3))->t$category + + t[sex!=3,.(length(person_no)),.(category,sex)]->t + t[,prop:=V1/sum(V1),sex] + t +#+end_src + +#+RESULTS: roughwork +#+begin_src org +- category sex V1 prop +- 1: 1 2 3697 0.52995986 +- 2: 1 1 7066 0.23051577 +- 3: 2 2 3047 0.43678326 +- 4: 2 1 20363 0.66430692 +- 5: 3 1 3224 0.10517731 +- 6: 3 2 232 0.03325688 +#+end_src diff --git a/bsample2.png b/bsample2.png index d5eae24..80d499e 100644 --- a/bsample2.png +++ b/bsample2.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a1bdd5e69d2c9d15bac557524888eb11f802d48433b1ef7f786f5c99cda8319 -size 175327 +oid sha256:e7522598facab12f8be78b796dc0bed898581a56b4233fe62b1fd3e25a13e79d +size 91825 diff --git a/statistical-inference.org b/statistical-inference.org index 0d9682b..5033e26 100644 --- a/statistical-inference.org +++ b/statistical-inference.org @@ -325,15 +325,9 @@ $H_{a}: \mu_{women} \neq \mu_{men}$ #+RESULTS: ttest2 #+begin_src org -- Welch Two Sample t-test -- data: wage by sex -- t = 79.02, df = 13483, p-value < 0.00000000000000022 -- alternative hypothesis: true difference in means is not equal to 0 -- 95 percent confidence interval: -- 104.6563 109.9805 -- sample estimates: -- mean in group 1 mean in group 2 -- 310.8974 203.5790 +- Error in subset(worker, sex != 3) : object 'worker' not found +- Error in factor(t9$sex) : object 't9' not found +- Error in eval(m$data, parent.frame()) : object 't9' not found #+end_src