You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

264 lines
10 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#+TITLE: Correlation and Regressions
#+PROPERTY: header-args:R :session acj :eval never-export
#+STARTUP: hideall inlineimages hideblocks
#+HTML_HEAD: <style>#content{max-width:1200px;} </style>
#+NAME: sassociation1
#+BEGIN_SRC R :results output list org
# Load the worker data, prepare the regression variables, and test the
# correlation of wage with years of education and with age.
library(data.table)
worker <- readRDS("plfsdata/plfsacjdata.rds")

# Expose `standardwage` under the shorter name used by the models.
worker$wage <- worker$standardwage

# Categorical codes must be factors so that lm() expands them into dummies.
worker$social_group <- factor(worker$social_group)
worker$religion <- factor(worker$religion)
# State codes go through as.numeric() before being turned into a factor.
worker$state <- factor(as.numeric(worker$state))
worker$sector <- factor(worker$sector)

# The arguments are kept as `worker$...` so the printed "data:" line is stable.
cor.test(worker$wage, worker$years_edu)
cor.test(worker$wage, worker$age)
#+end_src
#+RESULTS: sassociation1
#+begin_src org
- Pearson's product-moment correlation
- data: worker$wage and worker$years_edu
- t = 35.998, df = 37633, p-value < 0.00000000000000022
- alternative hypothesis: true correlation is not equal to 0
- 95 percent confidence interval:
- 0.1726625 0.1921962
- sample estimates:
- cor
- 0.1824473
- Pearson's product-moment correlation
- data: worker$wage and worker$age
- t = 9.3777, df = 37633, p-value < 0.00000000000000022
- alternative hypothesis: true correlation is not equal to 0
- 95 percent confidence interval:
- 0.03819950 0.05835859
- sample estimates:
- cor
- 0.04828396
#+end_src
#+NAME: sassociation2
#+BEGIN_SRC R :results output
# Three nested OLS wage regressions; each fit overwrites `t` and its summary
# is printed before the next model is estimated.
# NOTE(review): `sex` appears as a single numeric term in the stored output,
# while the roughwork block filters `sex != 3` - confirm whether it should be
# a factor.

# Model 1: demographic controls only.
t <- lm(wage ~ sex + age + years_edu, data = worker)
print(summary(t))

# Model 2: adds sector, social group, religion and survey quarter.
t <- lm(
  wage ~ sex + age + years_edu + sector + social_group + religion + quarter,
  data = worker
)
print(summary(t))

# Model 3: adds state fixed effects.
t <- lm(
  wage ~ sex + age + years_edu +
    sector + social_group + religion + quarter + state,
  data = worker
)
print(summary(t))
#+end_src
#+RESULTS: sassociation2
#+begin_example
Call:
lm(formula = wage ~ sex + age + years_edu, data = worker)
Residuals:
Min 1Q Median 3Q Max
-292.30 -74.80 -14.61 57.25 2170.72
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 327.62380 3.44462 95.11 <0.0000000000000002 ***
sex -97.04291 1.67477 -57.94 <0.0000000000000002 ***
age 1.38151 0.05526 25.00 <0.0000000000000002 ***
years_edu 4.48268 0.15777 28.41 <0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 121.9 on 37631 degrees of freedom
Multiple R-squared: 0.1261, Adjusted R-squared: 0.126
F-statistic: 1809 on 3 and 37631 DF, p-value: < 0.00000000000000022
Call:
lm(formula = wage ~ sex + age + years_edu + sector + social_group +
    religion + quarter, data = worker)
Residuals:
Min 1Q Median 3Q Max
-315.65 -72.11 -14.15 52.51 2140.20
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 257.94757 3.99129 64.628 < 0.0000000000000002 ***
sex -86.56036 1.65149 -52.414 < 0.0000000000000002 ***
age 1.23305 0.05423 22.735 < 0.0000000000000002 ***
years_edu 3.76046 0.15523 24.225 < 0.0000000000000002 ***
sector2 30.32294 1.41191 21.477 < 0.0000000000000002 ***
social_group2 29.29921 2.21158 13.248 < 0.0000000000000002 ***
social_group3 38.85215 2.10791 18.432 < 0.0000000000000002 ***
social_group9 36.39078 2.37599 15.316 < 0.0000000000000002 ***
religion2 15.28020 1.88933 8.088 0.000000000000000626 ***
religion3 70.06425 3.05735 22.917 < 0.0000000000000002 ***
religion4 0.90753 4.34322 0.209 0.834486
religion5 9.92952 16.27422 0.610 0.541774
religion6 -27.61440 5.46808 -5.050 0.000000443610331942 ***
religion7 -28.12453 44.72117 -0.629 0.529427
religion9 22.53928 5.90700 3.816 0.000136 ***
quarterQ2 11.19254 1.98958 5.626 0.000000018618898647 ***
quarterQ3 15.15791 1.89871 7.983 0.000000000000001465 ***
quarterQ4 25.67351 1.85931 13.808 < 0.0000000000000002 ***
quarterQ5 32.23325 2.39793 13.442 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 118.3 on 37616 degrees of freedom
Multiple R-squared: 0.1778, Adjusted R-squared: 0.1774
F-statistic: 451.8 on 18 and 37616 DF, p-value: < 0.00000000000000022
Call:
lm(formula = wage ~ sex + age + years_edu + sector + social_group +
    religion + quarter + state, data = worker)
Residuals:
Min 1Q Median 3Q Max
-421.28 -53.87 -3.68 47.22 1919.02
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 444.01115 4.77338 93.018 < 0.0000000000000002 ***
sex -90.39577 1.35581 -66.673 < 0.0000000000000002 ***
age 0.09864 0.04447 2.218 0.026552 *
years_edu 0.78564 0.12845 6.116 0.00000000096630021 ***
sector2 20.38448 1.17589 17.335 < 0.0000000000000002 ***
social_group2 8.16327 1.89780 4.301 0.00001701239040995 ***
social_group3 7.00001 1.82389 3.838 0.000124 ***
social_group9 14.48824 2.02081 7.170 0.00000000000076649 ***
religion2 3.06706 1.61097 1.904 0.056937 .
religion3 3.99327 3.09503 1.290 0.196982
religion4 1.21196 4.75423 0.255 0.798784
religion5 18.31698 13.00836 1.408 0.159112
religion6 11.40087 4.59791 2.480 0.013158 *
religion7 1.18490 35.69842 0.033 0.973522
religion9 22.95917 4.86976 4.715 0.00000243003135992 ***
quarterQ2 10.21411 1.58877 6.429 0.00000000013002120 ***
quarterQ3 16.82258 1.51923 11.073 < 0.0000000000000002 ***
quarterQ4 25.14926 1.49016 16.877 < 0.0000000000000002 ***
quarterQ5 33.08983 1.95044 16.965 < 0.0000000000000002 ***
state02 -68.16250 5.62831 -12.111 < 0.0000000000000002 ***
state03 -96.24984 5.63379 -17.084 < 0.0000000000000002 ***
state04 -71.16512 23.87015 -2.981 0.002872 **
state05 -115.06098 7.55209 -15.236 < 0.0000000000000002 ***
state06 -69.65578 5.27142 -13.214 < 0.0000000000000002 ***
state07 -46.63160 13.00126 -3.587 0.000335 ***
state08 -93.14096 4.86947 -19.128 < 0.0000000000000002 ***
state09 -130.45617 4.11177 -31.728 < 0.0000000000000002 ***
state1 -14.79943 6.61379 -2.238 0.025249 *
state10 -111.30326 4.22934 -26.317 < 0.0000000000000002 ***
state11 -30.89194 13.71763 -2.252 0.024329 *
state12 -69.11195 8.17680 -8.452 < 0.0000000000000002 ***
state13 -23.54944 16.54163 -1.424 0.154557
state14 -58.14358 5.19732 -11.187 < 0.0000000000000002 ***
state15 -67.54526 6.62181 -10.200 < 0.0000000000000002 ***
state16 -45.92342 5.48738 -8.369 < 0.0000000000000002 ***
state17 -53.92123 6.67236 -8.081 0.00000000000000066 ***
state18 -86.09270 4.86428 -17.699 < 0.0000000000000002 ***
state19 -150.31537 3.89883 -38.554 < 0.0000000000000002 ***
state2 -56.38856 11.04817 -5.104 0.00000033437294375 ***
state20 -137.77298 4.59079 -30.011 < 0.0000000000000002 ***
state21 -147.27217 4.45380 -33.067 < 0.0000000000000002 ***
state22 -172.27146 4.70857 -36.587 < 0.0000000000000002 ***
state23 -161.52791 4.12939 -39.117 < 0.0000000000000002 ***
state24 -145.44044 4.54569 -31.995 < 0.0000000000000002 ***
state25 -63.12764 47.32958 -1.334 0.182281
state26 -102.84861 30.08812 -3.418 0.000631 ***
state27 -142.91329 3.89854 -36.658 < 0.0000000000000002 ***
state28 -84.41173 4.12350 -20.471 < 0.0000000000000002 ***
state29 -100.69647 4.08499 -24.650 < 0.0000000000000002 ***
state3 -121.25472 6.44927 -18.801 < 0.0000000000000002 ***
state30 25.50491 11.26961 2.263 0.023632 *
state31 112.13284 13.52131 8.293 < 0.0000000000000002 ***
state32 146.37176 4.10836 35.628 < 0.0000000000000002 ***
state33 -40.33733 3.97872 -10.138 < 0.0000000000000002 ***
state34 -61.09002 6.25575 -9.765 < 0.0000000000000002 ***
state35 17.72228 8.72443 2.031 0.042227 *
state36 -85.02720 4.53862 -18.734 < 0.0000000000000002 ***
state4 -68.28980 27.50122 -2.483 0.013027 *
state5 -144.20165 8.92408 -16.159 < 0.0000000000000002 ***
state6 -71.93804 8.39931 -8.565 < 0.0000000000000002 ***
state7 -41.36610 17.62915 -2.346 0.018958 *
state8 -103.71783 6.46870 -16.034 < 0.0000000000000002 ***
state9 -152.19543 5.27778 -28.837 < 0.0000000000000002 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 94.36 on 37572 degrees of freedom
Multiple R-squared: 0.4771, Adjusted R-squared: 0.4763
F-statistic: 553 on 62 and 37572 DF, p-value: < 0.00000000000000022
#+end_example
#+NAME: sassociation3
#+BEGIN_SRC R :results output graphics :file bsample2.png :width 2500 :height 1500 :res 300
# Residual diagnostics: fit the full wage model in levels and in logs, then
# plot the response against the residuals of each model in separate facets.
library(data.table)
library(ggplot2) # ggplot(), aes(), geom_point() and facet_wrap() are used below

worker <- readRDS("plfsdata/plfsacjdata.rds")
worker$wage <- worker$standardwage
worker$social_group <- factor(worker$social_group)
worker$religion <- factor(worker$religion)
# Normalise state codes before building the factor: the raw codes mix
# zero-padded and unpadded spellings ("02" and "2"), which would otherwise
# become separate levels and separate dummies. Going through as.character()
# keeps this correct whether the column is stored as character or factor.
worker$state <- factor(as.numeric(as.character(worker$state)))
worker$sector <- factor(worker$sector)
t9 <- worker

# Same specification twice: wage in levels (model1) and in logs (model2).
t <- lm(
  wage ~ sex + age + years_edu + sector + social_group + religion +
    quarter + state,
  data = t9
)
t2 <- lm(
  log(wage) ~ sex + age + years_edu + sector + social_group + religion +
    quarter + state,
  data = t9
)

# Take the response from each model frame rather than from t9, so yvar always
# has exactly one value per residual even if lm() dropped incomplete rows.
a <- rbind(
  data.frame(
    yvar = model.response(model.frame(t)),
    residuals = residuals(t),
    variable = "model1"
  ),
  data.frame(
    yvar = model.response(model.frame(t2)),
    residuals = residuals(t2),
    variable = "model2"
  )
)

# Free scales because the level and log responses are on different ranges.
p <- ggplot(a, aes(x = residuals, y = yvar, group = variable))
p + geom_point() + facet_wrap(. ~ variable, scales = "free")
#+end_src
#+RESULTS: sassociation3
[[file:bsample2.png]]
#+NAME: roughwork
#+BEGIN_SRC R :results output list org
# Share of workers in each education category, computed within each sex.
# Work on an explicit copy: `:=` updates a data.table by reference, so without
# copy() the shared `worker` object would be modified as well.
t <- copy(worker)
t[, years_edu := as.numeric(years_edu)]

# Education category: 1 = zero years, 2 = fewer than 12 years, 3 = otherwise.
# Rows with missing years_edu get NA. (An earlier reverse-coded assignment
# was overwritten by this one and has been removed.)
t[, category := ifelse(years_edu == 0, 1, ifelse(years_edu < 12, 2, 3))]

# Count rows per category and sex, excluding rows with sex code 3. The count
# column keeps the name V1 so the printed table matches the stored results.
t <- t[sex != 3, .(V1 = .N), by = .(category, sex)]

# Proportions are taken within each sex, so they sum to 1 per sex.
t[, prop := V1 / sum(V1), by = sex]
t
#+end_src
#+RESULTS: roughwork
#+begin_src org
- category sex V1 prop
- 1: 1 2 3697 0.52995986
- 2: 1 1 7066 0.23051577
- 3: 2 2 3047 0.43678326
- 4: 2 1 20363 0.66430692
- 5: 3 1 3224 0.10517731
- 6: 3 2 232 0.03325688
#+end_src