Added associations.org

master
Vikas Rawal 6 years ago
parent 66e49313e6
commit 60e8528de3

1
.gitignore vendored

@ -3,6 +3,7 @@
!acjlectures.org
!acjlecturesday1.org
!acjlecturesday2.org
!associations.org
!.gitignore
!.gitattributes
!graphics

@ -3,4 +3,5 @@
## [Introduction](../master/index.org)
## [Descriptive Statistics](../master/descriptive-statistics.org)
## [Introduction to Statistical Inference](../master/statistical-inference.org)
## [Correlation and Regressions](../master/associations.org)

@ -0,0 +1,233 @@
#+TITLE: Correlation and Regressions
#+PROPERTY: header-args:R :session acj :eval never-export
#+STARTUP: hideall inlineimages hideblocks
#+HTML_HEAD: <style>#content{max-width:1200px;} </style>
#+NAME: sassociation1
#+BEGIN_SRC R :results output list org :exports results
# Load the worker-level data set and prepare the variables used in the
# correlation and regression examples.
library(data.table)
worker <- readRDS("plfsdata/plfsacjdata.rds")

# Expose the `standardwage` column under the shorter name `wage`.
worker$wage <- worker$standardwage

# Store the grouping variables as factors.
worker$social_group <- factor(worker$social_group)
worker$religion <- factor(worker$religion)
# State codes go through as.numeric() before factor().
# NOTE(review): presumably this merges differently padded codes such as
# "02" and "2" into one level -- confirm against the raw data.
worker$state <- factor(as.numeric(worker$state))
worker$sector <- factor(worker$sector)

# Pearson correlation of wage with years of education, then with age.
cor.test(worker$wage, worker$years_edu)
cor.test(worker$wage, worker$age)
#+end_src
#+RESULTS: sassociation1
#+begin_src org
- Pearson's product-moment correlation
- data: worker$wage and worker$years_edu
- t = 35.998, df = 37633, p-value < 0.00000000000000022
- alternative hypothesis: true correlation is not equal to 0
- 95 percent confidence interval:
- 0.1726625 0.1921962
- sample estimates:
- cor
- 0.1824473
- Pearson's product-moment correlation
- data: worker$wage and worker$age
- t = 9.3777, df = 37633, p-value < 0.00000000000000022
- alternative hypothesis: true correlation is not equal to 0
- 95 percent confidence interval:
- 0.03819950 0.05835859
- sample estimates:
- cor
- 0.04828396
#+end_src
#+NAME: sassociation2
#+BEGIN_SRC R :results output list org :exports results
# Three nested OLS wage regressions on the session object `worker`.
# Each fit overwrites `t`; the summary is printed straight after fitting.
# NOTE(review): `sex` enters the formulas as-is (it is not converted to a
# factor in this file) -- confirm that a numeric coding is intended.

# Model 1: sex, age and years of education.
t <- lm(wage ~ sex + age + years_edu, data = worker)
summary(t)

# Model 2: adds sector, social group, religion and quarter.
t <- lm(
  wage ~ sex + age + years_edu + sector + social_group + religion + quarter,
  data = worker
)
summary(t)

# Model 3: adds state on top of model 2.
t <- lm(
  wage ~ sex + age + years_edu +
    sector + social_group + religion + quarter + state,
  data = worker
)
summary(t)
#+end_src
#+RESULTS: sassociation2
#+begin_src org
- Call:
- lm(formula = wage ~ sex
- age
- years_edu, data = worker)
- Residuals:
- Min 1Q Median 3Q Max
- -1638.7 -489.5 -72.1 437.6 12305.1
- Coefficients:
- Estimate Std. Error t value Pr(>|t|)
- (Intercept) 2185.0021 19.6473 111.211 < 0.0000000000000002 ***
- sex -667.9011 9.5525 -69.919 < 0.0000000000000002 ***
- age 1.9781 0.3152 6.276 0.000000000352 ***
- years_edu 10.7387 0.8999 11.933 < 0.0000000000000002 ***
- ---
- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
- Residual standard error: 695.3 on 37631 degrees of freedom
- Multiple R-squared: 0.1343, Adjusted R-squared: 0.1342
- F-statistic: 1946 on 3 and 37631 DF, p-value: < 0.00000000000000022
- Call:
- lm(formula = wage ~ sex
- age
- years_edu
- sector
- social_group
- religion
- quarter, data = worker)
- Residuals:
- Min 1Q Median 3Q Max
- -1789.3 -484.2 -60.3 432.2 12337.4
- Coefficients:
- Estimate Std. Error t value Pr(>|t|)
- (Intercept) 1716.2498 23.9940 71.528 < 0.0000000000000002 ***
- sex -621.1712 9.4569 -65.684 < 0.0000000000000002 ***
- age 1.5207 0.3096 4.912 0.00000090742386603 ***
- years_edu 8.0624 0.8860 9.100 < 0.0000000000000002 ***
- sector 211.8919 8.0576 26.297 < 0.0000000000000002 ***
- social_group 15.1192 1.3090 11.550 < 0.0000000000000002 ***
- religion 24.2759 3.0235 8.029 0.00000000000000101 ***
- quarterQ2 33.0336 11.4750 2.879 0.00399 **
- quarterQ3 42.1100 10.9479 3.846 0.00012 ***
- quarterQ4 60.3340 10.7247 5.626 0.00000001860438745 ***
- quarterQ5 96.8388 13.8317 7.001 0.00000000000257946 ***
- ---
- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
- Residual standard error: 682.3 on 37624 degrees of freedom
- Multiple R-squared: 0.1665, Adjusted R-squared: 0.1662
- F-statistic: 751.4 on 10 and 37624 DF, p-value: < 0.00000000000000022
- Call:
- lm(formula = wage ~ sex
- age
- years_edu
- sector
- social_group
- religion
- quarter
- state, data = worker)
- Residuals:
- Min 1Q Median 3Q Max
- -2166.6 -420.9 -26.9 393.6 11885.4
- Coefficients:
- Estimate Std. Error t value Pr(>|t|)
- (Intercept) 2334.9216 31.6980 73.661 < 0.0000000000000002 ***
- sex -600.1791 8.9602 -66.983 < 0.0000000000000002 ***
- age -0.7958 0.2925 -2.720 0.006522 **
- years_edu 3.5156 0.8463 4.154 0.0000327426267356 ***
- sector 170.5218 7.7194 22.090 < 0.0000000000000002 ***
- social_group 11.0980 1.2958 8.565 < 0.0000000000000002 ***
- religion 6.8266 3.0199 2.261 0.023793 *
- quarterQ2 27.5969 10.5232 2.622 0.008733 **
- quarterQ3 47.4106 10.0603 4.713 0.0000024541348868 ***
- quarterQ4 65.0550 9.8696 6.591 0.0000000000441218 ***
- quarterQ5 111.5467 12.9159 8.636 < 0.0000000000000002 ***
- state02 -182.7089 36.7495 -4.972 0.0000006664834133 ***
- state03 -302.4308 31.4091 -9.629 < 0.0000000000000002 ***
- state04 54.6670 157.9356 0.346 0.729243
- state05 -364.2626 49.7310 -7.325 0.0000000000002443 ***
- state06 -158.2256 34.1364 -4.635 0.0000035796630662 ***
- state07 103.8315 85.9533 1.208 0.227056
- state08 -232.7281 31.6792 -7.346 0.0000000000002077 ***
- state09 -625.4561 26.6275 -23.489 < 0.0000000000000002 ***
- state1 -329.4467 43.8025 -7.521 0.0000000000000555 ***
- state10 -228.9421 27.2168 -8.412 < 0.0000000000000002 ***
- state11 229.4834 90.4784 2.536 0.011206 *
- state12 8.6292 53.5113 0.161 0.871889
- state13 -414.2133 108.2687 -3.826 0.000131 ***
- state14 24.9990 33.2053 0.753 0.451537
- state15 -386.4679 38.6430 -10.001 < 0.0000000000000002 ***
- state16 -408.9313 35.7746 -11.431 < 0.0000000000000002 ***
- state17 -169.2716 39.6051 -4.274 0.0000192485172076 ***
- state18 -191.4328 31.9025 -6.001 0.0000000019842884 ***
- state19 -849.3718 25.4194 -33.414 < 0.0000000000000002 ***
- state2 -127.5145 72.9047 -1.749 0.080289 .
- state20 -627.1074 29.7862 -21.054 < 0.0000000000000002 ***
- state21 -739.1066 28.7611 -25.698 < 0.0000000000000002 ***
- state22 -873.9305 30.4180 -28.731 < 0.0000000000000002 ***
- state23 -803.9339 26.7202 -30.087 < 0.0000000000000002 ***
- state24 -541.8985 29.4421 -18.406 < 0.0000000000000002 ***
- state25 57.8958 313.5190 0.185 0.853493
- state26 -211.9977 199.1455 -1.065 0.287093
- state27 -594.3999 24.9000 -23.872 < 0.0000000000000002 ***
- state28 -381.2620 26.4571 -14.411 < 0.0000000000000002 ***
- state29 -367.1243 26.4633 -13.873 < 0.0000000000000002 ***
- state3 -479.1579 40.3655 -11.870 < 0.0000000000000002 ***
- state30 63.4353 74.2422 0.854 0.392869
- state31 157.6998 89.1517 1.769 0.076920 .
- state32 38.5378 26.3576 1.462 0.143717
- state33 -423.9537 25.3135 -16.748 < 0.0000000000000002 ***
- state34 -419.1475 40.5708 -10.331 < 0.0000000000000002 ***
- state35 416.7570 57.3598 7.266 0.0000000000003784 ***
- state36 -31.7199 29.3147 -1.082 0.279238
- state4 371.0493 182.0114 2.039 0.041496 *
- state5 -599.6303 58.8906 -10.182 < 0.0000000000000002 ***
- state6 -197.8508 55.0537 -3.594 0.000326 ***
- state7 253.3238 116.6312 2.172 0.029861 *
- state8 -291.8271 42.4682 -6.872 0.0000000000064448 ***
- state9 -734.4680 34.6586 -21.192 < 0.0000000000000002 ***
- ---
- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
- Residual standard error: 625.2 on 37580 degrees of freedom
- Multiple R-squared: 0.301, Adjusted R-squared: 0.3
- F-statistic: 299.7 on 54 and 37580 DF, p-value: < 0.00000000000000022
#+end_src
#+NAME: sassociation3
#+BEGIN_SRC R :results output graphics :exports results :file bsample2.png :width 2500 :height 1500 :res 300
# Residual diagnostics: plot the response against the residuals for the
# full wage regression in levels ("model1") and in logs ("model2").
library(data.table)
# ggplot(), aes(), geom_point() and facet_wrap() are used below; attach
# ggplot2 here so the block does not rely on an earlier block in the
# session having loaded it.
library(ggplot2)

worker <- readRDS("plfsdata/plfsacjdata.rds")
worker$wage <- worker$standardwage
worker$social_group <- factor(worker$social_group)
worker$religion <- factor(worker$religion)
# Same state handling as the first block of this file, so both blocks
# leave `worker$state` with the same set of levels.
# NOTE(review): presumably this merges codes such as "02" and "2" --
# confirm against the raw data.
worker$state <- factor(as.numeric(worker$state))
worker$sector <- factor(worker$sector)

# `t9` is kept as a session-level name for the analysis data.
t9 <- worker

t <- lm(
  wage ~ sex + age + years_edu + sector + social_group + religion +
    quarter + state,
  data = t9
)
t2 <- lm(
  log(wage) ~ sex + age + years_edu + sector + social_group + religion +
    quarter + state,
  data = t9
)

# Take the response from each model frame rather than from `t9`, so that
# `yvar` always lines up with the residuals even if lm() dropped rows.
a <- rbind(
  data.frame(
    yvar = model.response(model.frame(t)),
    residuals = residuals(t),
    variable = "model1"
  ),
  data.frame(
    yvar = model.response(model.frame(t2)),
    residuals = residuals(t2),
    variable = "model2"
  )
)

# One panel per model, with free scales because levels and logs differ.
p <- ggplot(a, aes(x = residuals, y = yvar, group = variable))
p + geom_point() + facet_wrap(. ~ variable, scales = "free")
#+end_src
#+RESULTS: sassociation3
[[file:bsample2.png]]
#+NAME: roughwork
#+BEGIN_SRC R :results output list org :exports results
# Share of workers in each education category, computed within each sex.
# Category coding: 1 = zero years of education, 2 = fewer than 12 years,
# 3 = everything else (rows with missing years_edu stay NA).

# Work on an explicit copy: data.table's `:=` modifies by reference, so
# editing a plain alias of `worker` could alter `worker` itself.
t <- copy(worker)
t[, years_edu := as.numeric(years_edu)]

# Single assignment of the category. An earlier draft first assigned a
# reversed 3/2/1 coding that was immediately overwritten; that dead code
# is dropped here.
t[, category := ifelse(years_edu == 0, 1, ifelse(years_edu < 12, 2, 3))]

# Count rows per (category, sex), leaving out rows where sex is coded 3.
# The count column keeps the name V1 used in the recorded results.
t <- t[sex != 3, .(V1 = .N), by = .(category, sex)]

# Proportion of each category within its sex group.
t[, prop := V1 / sum(V1), by = sex]
t
#+end_src
#+RESULTS: roughwork
#+begin_src org
- category sex V1 prop
- 1: 1 2 3697 0.52995986
- 2: 1 1 7066 0.23051577
- 3: 2 2 3047 0.43678326
- 4: 2 1 20363 0.66430692
- 5: 3 1 3224 0.10517731
- 6: 3 2 232 0.03325688
#+end_src

BIN
bsample2.png (Stored with Git LFS)

Binary file not shown.

@ -325,15 +325,9 @@ $H_{a}: \mu_{women} \neq \mu_{men}$
#+RESULTS: ttest2
#+begin_src org
- Welch Two Sample t-test
- data: wage by sex
- t = 79.02, df = 13483, p-value < 0.00000000000000022
- alternative hypothesis: true difference in means is not equal to 0
- 95 percent confidence interval:
- 104.6563 109.9805
- sample estimates:
- mean in group 1 mean in group 2
- 310.8974 203.5790
- Error in subset(worker, sex != 3) : object 'worker' not found
- Error in factor(t9$sex) : object 't9' not found
- Error in eval(m$data, parent.frame()) : object 't9' not found
#+end_src

Loading…
Cancel
Save