Files
quantitative-methods/associations.org
T
2019-11-30 14:25:09 +05:30

264 lines
10 KiB
Org Mode
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#+TITLE: Correlation and Regressions
#+PROPERTY: header-args:R :session acj :eval never-export
#+STARTUP: hideall inlineimages hideblocks
#+HTML_HEAD: <style>#content{max-width:1200px;} </style>
#+NAME: sassociation1
#+BEGIN_SRC R :results output list org
# Load the worker extract, prepare the variables that the model formulas in
# this file refer to, and test the correlation of wage with schooling and age.
library(data.table)

worker <- readRDS("plfsdata/plfsacjdata.rds")

# `wage` is the response name used by the regression formulas.
worker$wage <- worker$standardwage

# Categorical covariates are stored as factors so that lm() expands them
# into one dummy per level.
worker$social_group <- factor(worker$social_group)
worker$religion <- factor(worker$religion)
worker$sector <- factor(worker$sector)

# Normalise state codes to numbers before making the factor, so that codes
# such as "02" and "2" collapse into a single level. The as.character() step
# matters when `state` is already a factor: as.numeric() on a factor returns
# the internal level codes, not the values. For a character column it is a
# no-op.
worker$state <- factor(as.numeric(as.character(worker$state)))

# Pearson correlation tests (cor.test() default method).
cor.test(worker$wage, worker$years_edu)
cor.test(worker$wage, worker$age)
#+end_src
#+RESULTS: sassociation1
#+begin_src org
- Pearson's product-moment correlation
- data: worker$wage and worker$years_edu
- t = 35.998, df = 37633, p-value < 0.00000000000000022
- alternative hypothesis: true correlation is not equal to 0
- 95 percent confidence interval:
- 0.1726625 0.1921962
- sample estimates:
- cor
- 0.1824473
- Pearson's product-moment correlation
- data: worker$wage and worker$age
- t = 9.3777, df = 37633, p-value < 0.00000000000000022
- alternative hypothesis: true correlation is not equal to 0
- 95 percent confidence interval:
- 0.03819950 0.05835859
- sample estimates:
- cor
- 0.04828396
#+end_src
#+NAME: sassociation2
#+BEGIN_SRC R :results output
# Three nested OLS wage regressions on the session's `worker` data:
#   1. sex, age and years of education only;
#   2. adding sector, social group, religion and quarter;
#   3. adding state as well.
# Each fit overwrites `t`, so only the last model remains in the session.
# NOTE(review): `sex` enters as a single numeric term in the recorded output;
# confirm that its coding makes a linear effect meaningful.

# Model 1
t <- lm(wage ~ sex + age + years_edu, data = worker)
print(summary(t))

# Model 2
t <- lm(
  wage ~ sex + age + years_edu + sector + social_group + religion + quarter,
  data = worker
)
print(summary(t))

# Model 3
t <- lm(
  wage ~ sex + age + years_edu + sector + social_group + religion + quarter +
    state,
  data = worker
)
print(summary(t))
#+end_src
#+RESULTS: sassociation2
#+begin_example
Call:
lm(formula = wage ~ sex + age + years_edu, data = worker)
Residuals:
Min 1Q Median 3Q Max
-292.30 -74.80 -14.61 57.25 2170.72
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 327.62380 3.44462 95.11 <0.0000000000000002 ***
sex -97.04291 1.67477 -57.94 <0.0000000000000002 ***
age 1.38151 0.05526 25.00 <0.0000000000000002 ***
years_edu 4.48268 0.15777 28.41 <0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 121.9 on 37631 degrees of freedom
Multiple R-squared: 0.1261, Adjusted R-squared: 0.126
F-statistic: 1809 on 3 and 37631 DF, p-value: < 0.00000000000000022
Call:
lm(formula = wage ~ sex + age + years_edu + sector + social_group +
    religion + quarter, data = worker)
Residuals:
Min 1Q Median 3Q Max
-315.65 -72.11 -14.15 52.51 2140.20
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 257.94757 3.99129 64.628 < 0.0000000000000002 ***
sex -86.56036 1.65149 -52.414 < 0.0000000000000002 ***
age 1.23305 0.05423 22.735 < 0.0000000000000002 ***
years_edu 3.76046 0.15523 24.225 < 0.0000000000000002 ***
sector2 30.32294 1.41191 21.477 < 0.0000000000000002 ***
social_group2 29.29921 2.21158 13.248 < 0.0000000000000002 ***
social_group3 38.85215 2.10791 18.432 < 0.0000000000000002 ***
social_group9 36.39078 2.37599 15.316 < 0.0000000000000002 ***
religion2 15.28020 1.88933 8.088 0.000000000000000626 ***
religion3 70.06425 3.05735 22.917 < 0.0000000000000002 ***
religion4 0.90753 4.34322 0.209 0.834486
religion5 9.92952 16.27422 0.610 0.541774
religion6 -27.61440 5.46808 -5.050 0.000000443610331942 ***
religion7 -28.12453 44.72117 -0.629 0.529427
religion9 22.53928 5.90700 3.816 0.000136 ***
quarterQ2 11.19254 1.98958 5.626 0.000000018618898647 ***
quarterQ3 15.15791 1.89871 7.983 0.000000000000001465 ***
quarterQ4 25.67351 1.85931 13.808 < 0.0000000000000002 ***
quarterQ5 32.23325 2.39793 13.442 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 118.3 on 37616 degrees of freedom
Multiple R-squared: 0.1778, Adjusted R-squared: 0.1774
F-statistic: 451.8 on 18 and 37616 DF, p-value: < 0.00000000000000022
Call:
lm(formula = wage ~ sex + age + years_edu + sector + social_group +
    religion + quarter + state, data = worker)
Residuals:
Min 1Q Median 3Q Max
-421.28 -53.87 -3.68 47.22 1919.02
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 444.01115 4.77338 93.018 < 0.0000000000000002 ***
sex -90.39577 1.35581 -66.673 < 0.0000000000000002 ***
age 0.09864 0.04447 2.218 0.026552 *
years_edu 0.78564 0.12845 6.116 0.00000000096630021 ***
sector2 20.38448 1.17589 17.335 < 0.0000000000000002 ***
social_group2 8.16327 1.89780 4.301 0.00001701239040995 ***
social_group3 7.00001 1.82389 3.838 0.000124 ***
social_group9 14.48824 2.02081 7.170 0.00000000000076649 ***
religion2 3.06706 1.61097 1.904 0.056937 .
religion3 3.99327 3.09503 1.290 0.196982
religion4 1.21196 4.75423 0.255 0.798784
religion5 18.31698 13.00836 1.408 0.159112
religion6 11.40087 4.59791 2.480 0.013158 *
religion7 1.18490 35.69842 0.033 0.973522
religion9 22.95917 4.86976 4.715 0.00000243003135992 ***
quarterQ2 10.21411 1.58877 6.429 0.00000000013002120 ***
quarterQ3 16.82258 1.51923 11.073 < 0.0000000000000002 ***
quarterQ4 25.14926 1.49016 16.877 < 0.0000000000000002 ***
quarterQ5 33.08983 1.95044 16.965 < 0.0000000000000002 ***
state02 -68.16250 5.62831 -12.111 < 0.0000000000000002 ***
state03 -96.24984 5.63379 -17.084 < 0.0000000000000002 ***
state04 -71.16512 23.87015 -2.981 0.002872 **
state05 -115.06098 7.55209 -15.236 < 0.0000000000000002 ***
state06 -69.65578 5.27142 -13.214 < 0.0000000000000002 ***
state07 -46.63160 13.00126 -3.587 0.000335 ***
state08 -93.14096 4.86947 -19.128 < 0.0000000000000002 ***
state09 -130.45617 4.11177 -31.728 < 0.0000000000000002 ***
state1 -14.79943 6.61379 -2.238 0.025249 *
state10 -111.30326 4.22934 -26.317 < 0.0000000000000002 ***
state11 -30.89194 13.71763 -2.252 0.024329 *
state12 -69.11195 8.17680 -8.452 < 0.0000000000000002 ***
state13 -23.54944 16.54163 -1.424 0.154557
state14 -58.14358 5.19732 -11.187 < 0.0000000000000002 ***
state15 -67.54526 6.62181 -10.200 < 0.0000000000000002 ***
state16 -45.92342 5.48738 -8.369 < 0.0000000000000002 ***
state17 -53.92123 6.67236 -8.081 0.00000000000000066 ***
state18 -86.09270 4.86428 -17.699 < 0.0000000000000002 ***
state19 -150.31537 3.89883 -38.554 < 0.0000000000000002 ***
state2 -56.38856 11.04817 -5.104 0.00000033437294375 ***
state20 -137.77298 4.59079 -30.011 < 0.0000000000000002 ***
state21 -147.27217 4.45380 -33.067 < 0.0000000000000002 ***
state22 -172.27146 4.70857 -36.587 < 0.0000000000000002 ***
state23 -161.52791 4.12939 -39.117 < 0.0000000000000002 ***
state24 -145.44044 4.54569 -31.995 < 0.0000000000000002 ***
state25 -63.12764 47.32958 -1.334 0.182281
state26 -102.84861 30.08812 -3.418 0.000631 ***
state27 -142.91329 3.89854 -36.658 < 0.0000000000000002 ***
state28 -84.41173 4.12350 -20.471 < 0.0000000000000002 ***
state29 -100.69647 4.08499 -24.650 < 0.0000000000000002 ***
state3 -121.25472 6.44927 -18.801 < 0.0000000000000002 ***
state30 25.50491 11.26961 2.263 0.023632 *
state31 112.13284 13.52131 8.293 < 0.0000000000000002 ***
state32 146.37176 4.10836 35.628 < 0.0000000000000002 ***
state33 -40.33733 3.97872 -10.138 < 0.0000000000000002 ***
state34 -61.09002 6.25575 -9.765 < 0.0000000000000002 ***
state35 17.72228 8.72443 2.031 0.042227 *
state36 -85.02720 4.53862 -18.734 < 0.0000000000000002 ***
state4 -68.28980 27.50122 -2.483 0.013027 *
state5 -144.20165 8.92408 -16.159 < 0.0000000000000002 ***
state6 -71.93804 8.39931 -8.565 < 0.0000000000000002 ***
state7 -41.36610 17.62915 -2.346 0.018958 *
state8 -103.71783 6.46870 -16.034 < 0.0000000000000002 ***
state9 -152.19543 5.27778 -28.837 < 0.0000000000000002 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 94.36 on 37572 degrees of freedom
Multiple R-squared: 0.4771, Adjusted R-squared: 0.4763
F-statistic: 553 on 62 and 37572 DF, p-value: < 0.00000000000000022
#+end_example
#+NAME: sassociation3
#+BEGIN_SRC R :results output graphics :file bsample2.png :width 2500 :height 1500 :res 300
# Plot the response against the residuals for two versions of the full wage
# regression: wage in levels ("model1") and log(wage) ("model2"), one facet
# per model with free scales.
library(data.table)
library(ggplot2) # ggplot(), aes(), geom_point() and facet_wrap() are used below

worker <- readRDS("plfsdata/plfsacjdata.rds")
worker$wage <- worker$standardwage
worker$social_group <- factor(worker$social_group)
worker$religion <- factor(worker$religion)
worker$sector <- factor(worker$sector)

# Normalise state codes to numbers first, so that "02" and "2" are one level
# rather than two separate dummies. as.character() guards against `state`
# already being a factor, where as.numeric() alone would return level codes.
worker$state <- factor(as.numeric(as.character(worker$state)))

fit_level <- lm(
  wage ~ sex + age + years_edu + sector + social_group + religion + quarter +
    state,
  data = worker
)
fit_log <- lm(
  log(wage) ~ sex + age + years_edu + sector + social_group + religion +
    quarter + state,
  data = worker
)

# Take the response from each model frame rather than from `worker`: the model
# frame holds exactly the rows lm() used, so the response and the residuals
# always have the same length even if rows with missing values were dropped.
resid_data <- rbind(
  data.frame(
    yvar = model.response(model.frame(fit_level)),
    residuals = residuals(fit_level),
    variable = "model1",
    row.names = NULL
  ),
  data.frame(
    yvar = model.response(model.frame(fit_log)),
    residuals = residuals(fit_log),
    variable = "model2",
    row.names = NULL
  )
)

p <- ggplot(resid_data, aes(x = residuals, y = yvar, group = variable)) +
  geom_point() +
  facet_wrap(. ~ variable, scales = "free")
print(p)
#+end_src
#+RESULTS: sassociation3
[[file:bsample2.png]]
#+NAME: roughwork
#+BEGIN_SRC R :results output list org
# Distribution of workers across three education categories, within each sex.
# Categories: 1 = zero years of education, 2 = fewer than 12 years,
# 3 = 12 years or more. Rows with sex == 3 are excluded.

# Build a fresh table with only the columns needed. Assigning `worker` to a
# new name and then using `:=` would modify `worker` itself by reference.
edu <- worker[, .(sex, years_edu = as.numeric(years_edu))]

# Rows with a missing years_edu keep a missing category.
edu[, category := ifelse(years_edu == 0, 1, ifelse(years_edu < 12, 2, 3))]

# Count rows per (category, sex); the count column keeps the name V1.
edu_by_sex <- edu[sex != 3, .(V1 = .N), by = .(category, sex)]

# Share of each category within its sex (shares sum to 1 per sex).
edu_by_sex[, prop := V1 / sum(V1), by = sex]
edu_by_sex
#+end_src
#+RESULTS: roughwork
#+begin_src org
- category sex V1 prop
- 1: 1 2 3697 0.52995986
- 2: 1 1 7066 0.23051577
- 3: 2 2 3047 0.43678326
- 4: 2 1 20363 0.66430692
- 5: 3 1 3224 0.10517731
- 6: 3 2 232 0.03325688
#+end_src