You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

249 lines
11 KiB

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

#+TITLE: Correlation and Regressions
#+PROPERTY: header-args:R :session acj :eval never-export
#+STARTUP: hideall inlineimages hideblocks
#+HTML_HEAD: <style>#content{max-width:1200px;} </style>
#+NAME: sassociation1
#+BEGIN_SRC R :results output list org
library(data.table)

# Read the worker data and copy the standardwage column to the shorter
# name `wage`, which the correlations and regressions refer to.
worker <- readRDS("plfsdata/plfsacjdata.rds")
worker$wage <- worker$standardwage

# Coded variables are turned into factors so that model fits treat them as
# categories. State codes go through as.numeric() before factoring.
worker$social_group <- factor(worker$social_group)
worker$religion <- factor(worker$religion)
worker$state <- factor(as.numeric(worker$state))
worker$sector <- factor(worker$sector)

# Pearson correlation tests of wage against years_edu and against age.
cor.test(worker$wage, worker$years_edu)
cor.test(worker$wage, worker$age)
#+end_src
#+RESULTS: sassociation1
#+begin_src org
- Pearson's product-moment correlation
- data: worker$wage and worker$years_edu
- t = 35.998, df = 37633, p-value < 0.00000000000000022
- alternative hypothesis: true correlation is not equal to 0
- 95 percent confidence interval:
- 0.1726625 0.1921962
- sample estimates:
- cor
- 0.1824473
- Pearson's product-moment correlation
- data: worker$wage and worker$age
- t = 9.3777, df = 37633, p-value < 0.00000000000000022
- alternative hypothesis: true correlation is not equal to 0
- 95 percent confidence interval:
- 0.03819950 0.05835859
- sample estimates:
- cor
- 0.04828396
#+end_src
#+NAME: sassociation2
#+BEGIN_SRC R :results output list org
# Three nested OLS fits of wage; each summary is printed in turn and `t`
# ends up holding the last (largest) model.
# NOTE(review): sex enters as a numeric code (one slope in the output) while
# other coded variables are factors -- confirm this is intended.

# Specification 1: sex, age and years_edu only.
t <- lm(wage ~ sex + age + years_edu, data = worker)
summary(t)

# Specification 2: adds sector, social_group, religion and quarter.
t <- lm(
  wage ~ sex + age + years_edu + sector + social_group + religion + quarter,
  data = worker
)
summary(t)

# Specification 3: specification 2 plus state.
t <- lm(
  wage ~ sex + age + years_edu +
    sector + social_group + religion + quarter + state,
  data = worker
)
summary(t)
#+end_src
#+RESULTS: sassociation2
#+begin_src org
- Call:
- lm(formula = wage ~ sex
- age
- years_edu, data = worker)
- Residuals:
- Min 1Q Median 3Q Max
- -292.30 -74.80 -14.61 57.25 2170.72
- Coefficients:
- Estimate Std. Error t value Pr(>|t|)
- (Intercept) 327.62380 3.44462 95.11 <0.0000000000000002 ***
- sex -97.04291 1.67477 -57.94 <0.0000000000000002 ***
- age 1.38151 0.05526 25.00 <0.0000000000000002 ***
- years_edu 4.48268 0.15777 28.41 <0.0000000000000002 ***
- ---
- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
- Residual standard error: 121.9 on 37631 degrees of freedom
- Multiple R-squared: 0.1261, Adjusted R-squared: 0.126
- F-statistic: 1809 on 3 and 37631 DF, p-value: < 0.00000000000000022
- Call:
- lm(formula = wage ~ sex
- age
- years_edu
- sector
- social_group
- religion
- quarter, data = worker)
- Residuals:
- Min 1Q Median 3Q Max
- -315.65 -72.11 -14.15 52.51 2140.20
- Coefficients:
- Estimate Std. Error t value Pr(>|t|)
- (Intercept) 257.94757 3.99129 64.628 < 0.0000000000000002 ***
- sex -86.56036 1.65149 -52.414 < 0.0000000000000002 ***
- age 1.23305 0.05423 22.735 < 0.0000000000000002 ***
- years_edu 3.76046 0.15523 24.225 < 0.0000000000000002 ***
- sector2 30.32294 1.41191 21.477 < 0.0000000000000002 ***
- social_group2 29.29921 2.21158 13.248 < 0.0000000000000002 ***
- social_group3 38.85215 2.10791 18.432 < 0.0000000000000002 ***
- social_group9 36.39078 2.37599 15.316 < 0.0000000000000002 ***
- religion2 15.28020 1.88933 8.088 0.000000000000000626 ***
- religion3 70.06425 3.05735 22.917 < 0.0000000000000002 ***
- religion4 0.90753 4.34322 0.209 0.834486
- religion5 9.92952 16.27422 0.610 0.541774
- religion6 -27.61440 5.46808 -5.050 0.000000443610331942 ***
- religion7 -28.12453 44.72117 -0.629 0.529427
- religion9 22.53928 5.90700 3.816 0.000136 ***
- quarterQ2 11.19254 1.98958 5.626 0.000000018618898647 ***
- quarterQ3 15.15791 1.89871 7.983 0.000000000000001465 ***
- quarterQ4 25.67351 1.85931 13.808 < 0.0000000000000002 ***
- quarterQ5 32.23325 2.39793 13.442 < 0.0000000000000002 ***
- ---
- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
- Residual standard error: 118.3 on 37616 degrees of freedom
- Multiple R-squared: 0.1778, Adjusted R-squared: 0.1774
- F-statistic: 451.8 on 18 and 37616 DF, p-value: < 0.00000000000000022
- Call:
- lm(formula = wage ~ sex
- age
- years_edu
- sector
- social_group
- religion
- quarter
- state, data = worker)
- Residuals:
- Min 1Q Median 3Q Max
- -421.28 -53.87 -3.68 47.22 1919.02
- Coefficients:
- Estimate Std. Error t value Pr(>|t|)
- (Intercept) 444.01115 4.77338 93.018 < 0.0000000000000002 ***
- sex -90.39577 1.35581 -66.673 < 0.0000000000000002 ***
- age 0.09864 0.04447 2.218 0.026552 *
- years_edu 0.78564 0.12845 6.116 0.00000000096630021 ***
- sector2 20.38448 1.17589 17.335 < 0.0000000000000002 ***
- social_group2 8.16327 1.89780 4.301 0.00001701239040995 ***
- social_group3 7.00001 1.82389 3.838 0.000124 ***
- social_group9 14.48824 2.02081 7.170 0.00000000000076649 ***
- religion2 3.06706 1.61097 1.904 0.056937 .
- religion3 3.99327 3.09503 1.290 0.196982
- religion4 1.21196 4.75423 0.255 0.798784
- religion5 18.31698 13.00836 1.408 0.159112
- religion6 11.40087 4.59791 2.480 0.013158 *
- religion7 1.18490 35.69842 0.033 0.973522
- religion9 22.95917 4.86976 4.715 0.00000243003135992 ***
- quarterQ2 10.21411 1.58877 6.429 0.00000000013002120 ***
- quarterQ3 16.82258 1.51923 11.073 < 0.0000000000000002 ***
- quarterQ4 25.14926 1.49016 16.877 < 0.0000000000000002 ***
- quarterQ5 33.08983 1.95044 16.965 < 0.0000000000000002 ***
- state02 -68.16250 5.62831 -12.111 < 0.0000000000000002 ***
- state03 -96.24984 5.63379 -17.084 < 0.0000000000000002 ***
- state04 -71.16512 23.87015 -2.981 0.002872 **
- state05 -115.06098 7.55209 -15.236 < 0.0000000000000002 ***
- state06 -69.65578 5.27142 -13.214 < 0.0000000000000002 ***
- state07 -46.63160 13.00126 -3.587 0.000335 ***
- state08 -93.14096 4.86947 -19.128 < 0.0000000000000002 ***
- state09 -130.45617 4.11177 -31.728 < 0.0000000000000002 ***
- state1 -14.79943 6.61379 -2.238 0.025249 *
- state10 -111.30326 4.22934 -26.317 < 0.0000000000000002 ***
- state11 -30.89194 13.71763 -2.252 0.024329 *
- state12 -69.11195 8.17680 -8.452 < 0.0000000000000002 ***
- state13 -23.54944 16.54163 -1.424 0.154557
- state14 -58.14358 5.19732 -11.187 < 0.0000000000000002 ***
- state15 -67.54526 6.62181 -10.200 < 0.0000000000000002 ***
- state16 -45.92342 5.48738 -8.369 < 0.0000000000000002 ***
- state17 -53.92123 6.67236 -8.081 0.00000000000000066 ***
- state18 -86.09270 4.86428 -17.699 < 0.0000000000000002 ***
- state19 -150.31537 3.89883 -38.554 < 0.0000000000000002 ***
- state2 -56.38856 11.04817 -5.104 0.00000033437294375 ***
- state20 -137.77298 4.59079 -30.011 < 0.0000000000000002 ***
- state21 -147.27217 4.45380 -33.067 < 0.0000000000000002 ***
- state22 -172.27146 4.70857 -36.587 < 0.0000000000000002 ***
- state23 -161.52791 4.12939 -39.117 < 0.0000000000000002 ***
- state24 -145.44044 4.54569 -31.995 < 0.0000000000000002 ***
- state25 -63.12764 47.32958 -1.334 0.182281
- state26 -102.84861 30.08812 -3.418 0.000631 ***
- state27 -142.91329 3.89854 -36.658 < 0.0000000000000002 ***
- state28 -84.41173 4.12350 -20.471 < 0.0000000000000002 ***
- state29 -100.69647 4.08499 -24.650 < 0.0000000000000002 ***
- state3 -121.25472 6.44927 -18.801 < 0.0000000000000002 ***
- state30 25.50491 11.26961 2.263 0.023632 *
- state31 112.13284 13.52131 8.293 < 0.0000000000000002 ***
- state32 146.37176 4.10836 35.628 < 0.0000000000000002 ***
- state33 -40.33733 3.97872 -10.138 < 0.0000000000000002 ***
- state34 -61.09002 6.25575 -9.765 < 0.0000000000000002 ***
- state35 17.72228 8.72443 2.031 0.042227 *
- state36 -85.02720 4.53862 -18.734 < 0.0000000000000002 ***
- state4 -68.28980 27.50122 -2.483 0.013027 *
- state5 -144.20165 8.92408 -16.159 < 0.0000000000000002 ***
- state6 -71.93804 8.39931 -8.565 < 0.0000000000000002 ***
- state7 -41.36610 17.62915 -2.346 0.018958 *
- state8 -103.71783 6.46870 -16.034 < 0.0000000000000002 ***
- state9 -152.19543 5.27778 -28.837 < 0.0000000000000002 ***
- ---
- Signif. codes: 0 *** 0.001 ** 0.01 * 0.05 . 0.1 1
- Residual standard error: 94.36 on 37572 degrees of freedom
- Multiple R-squared: 0.4771, Adjusted R-squared: 0.4763
- F-statistic: 553 on 62 and 37572 DF, p-value: < 0.00000000000000022
#+end_src
#+NAME: sassociation3
#+BEGIN_SRC R :results output graphics :file bsample2.png :width 2500 :height 1500 :res 300
library(data.table)
# ggplot(), aes(), geom_point() and facet_wrap() are used below; load ggplot2
# here so the block does not depend on another block having attached it.
library(ggplot2)

# Reload the worker data and rebuild the derived columns used by the models.
worker <- readRDS("plfsdata/plfsacjdata.rds")
worker$wage <- worker$standardwage
worker$social_group <- factor(worker$social_group)
worker$religion <- factor(worker$religion)
# Convert state codes to numbers before factoring, as sassociation1 does.
# Factoring the raw codes keeps differently padded codes (e.g. "02" and "2")
# as separate levels, which splits one state into two dummies.
worker$state <- factor(as.numeric(worker$state))
worker$sector <- factor(worker$sector)
t9 <- worker

# The same specification fitted to wage in levels (t) and in logs (t2).
t <- lm(
  wage ~ sex + age + years_edu + sector + social_group + religion +
    quarter + state,
  data = t9
)
t2 <- lm(
  log(wage) ~ sex + age + years_edu + sector + social_group + religion +
    quarter + state,
  data = t9
)

# Stack the dependent variable and the residuals of both models in long form,
# labelled by model, so they can be drawn as two facets.
a <- rbind(
  data.frame(yvar = t9$wage, residuals = residuals(t), variable = "model1"),
  data.frame(yvar = log(t9$wage), residuals = residuals(t2), variable = "model2")
)

# Scatter of the dependent variable against residuals, one panel per model;
# free scales because levels and logs are on different ranges.
p <- ggplot(a, aes(x = residuals, y = yvar, group = variable))
p + geom_point() + facet_wrap(. ~ variable, scales = "free")
#+end_src
#+RESULTS: sassociation3
[[file:bsample2.png]]
#+NAME: roughwork
#+BEGIN_SRC R :results value :colnames yes :hlines
# Share of workers in each years_edu category, computed separately by sex.
#
# Work on an explicit copy: `:=` updates a data.table by reference, and a
# plain assignment only creates an alias, so without copy() the edits below
# could leak into `worker`.
t <- copy(worker)
t[, years_edu := as.numeric(years_edu)]

# category: 1 when years_edu is 0, 2 when it is below 12, 3 otherwise
# (NA stays NA). An earlier set of assignments with the reverse coding was
# dropped because this ifelse() overwrote it entirely.
t[, category := ifelse(years_edu == 0, 1,
                       ifelse(years_edu < 12, 2, 3))]

# Count rows per category and sex, leaving out rows with sex code 3; the
# count column keeps the name V1 used in the stored result table.
t <- t[sex != 3, .(V1 = .N), by = .(category, sex)]

# Proportion of each category within its sex group.
t[, prop := V1 / sum(V1), by = sex]
t
#+end_src
#+RESULTS: roughwork
| category | sex | V1 | prop |
|----------+-----+-------+-------------------|
| 1 | 2 | 3697 | 0.529959862385321 |
| 1 | 1 | 7066 | 0.230515773333768 |
| 2 | 2 | 3047 | 0.436783256880734 |
| 2 | 1 | 20363 | 0.664306919387988 |
| 3 | 1 | 3224 | 0.105177307278244 |
| 3 | 2 | 232 | 0.033256880733945 |