From f27ae391b25fffef567f4bfc961f80dc747577eb Mon Sep 17 00:00:00 2001 From: Vikas Rawal Date: Wed, 27 Nov 2019 16:07:45 +0530 Subject: [PATCH] Initialised --- .gitattributes | 1 + .gitignore | 6 + acjlectures.org | 481 ++++++++++++++++++++++++++++++++++++ graphics/chart1.png | 3 + graphics/chart2.png | 3 + graphics/chart3.png | 3 + graphics/nobel-right.png | 3 + graphics/nobel-wrong.png | 3 + graphics/piketty1_c.png | 3 + graphics/piketty1_o.png | 3 + graphics/piketty2_c.png | 3 + graphics/piketty2_o.png | 3 + graphics/tufte-fuel.png | 3 + graphics/tufte-fuel2.png | 3 + graphics/tufte-insanity.png | 3 + 15 files changed, 524 insertions(+) create mode 100644 .gitattributes create mode 100644 .gitignore create mode 100644 acjlectures.org create mode 100644 graphics/chart1.png create mode 100644 graphics/chart2.png create mode 100644 graphics/chart3.png create mode 100644 graphics/nobel-right.png create mode 100644 graphics/nobel-wrong.png create mode 100644 graphics/piketty1_c.png create mode 100644 graphics/piketty1_o.png create mode 100644 graphics/piketty2_c.png create mode 100644 graphics/piketty2_o.png create mode 100644 graphics/tufte-fuel.png create mode 100644 graphics/tufte-fuel2.png create mode 100644 graphics/tufte-insanity.png diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..24a8e87 --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.png filter=lfs diff=lfs merge=lfs -text diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..d0b797c --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +* +!acjlectures.org +!.gitignore +!.gitattributes +!graphics +!*.png diff --git a/acjlectures.org b/acjlectures.org new file mode 100644 index 0000000..88b8156 --- /dev/null +++ b/acjlectures.org @@ -0,0 +1,481 @@ +#+TITLE: Quantitative Methods +#+PROPERTY: header-args:R :session acj :eval never-export +#+STARTUP: hideall inlineimages hideblocks + +* Title slide :slide: +#+BEGIN_SRC emacs-lisp-slide +(org-show-animate 
'("Quantitative Methods, Part-II" "Vikas Rawal" "Prachi Bansal" "" "" "")) +#+END_SRC +* Lecture plan +** Descriptive Statistics [1 day] +*** Measures of central tendency +*** Dispersion +*** Making Data Meaningful +**** Cross-tabulations +**** Data visualisation +** Probability and Inference [2 days] +*** Probability/Relevance to statistics +*** Sampling distributions +*** Hypothesis testing +** Correlations and Regression [1 day] +** All afternoons will be used for workshops + +* Day 1 +** Title slide +#+BEGIN_SRC emacs-lisp-slide +(org-show-animate '("Why do financial journalists need to know quantitative methods?" "" "" "")) +#+END_SRC + +** What do we aim to achieve in this course? :slide: +**** Make friends with numbers +**** Learn how to read numbers, how to present them, and how to write about them +**** Learn how to use computers to work with numbers +** Two Types of Statistics :slide: +*** Descriptive Statistics +**** Use summaries of data for the entire population to describe a population +**** Use summaries of sample data to describe a sample +*** Inferential Statistics +**** Use sample data to describe a population +** Descriptive Statistics :slide: + ++ Frequency ++ Measures of central tendency ++ Summary positions ++ Measures of dispersion + +*** Frequency :slide: + +#+NAME: worker-code0 +#+begin_src R :results value :exports results :colnames yes :hline + library(data.table) + data.table(names=c("Anil","Neeraj","Savita","Srimati", + "Rekha","Pooja","Alex","Shahina", + "Ghazal","Lakshmi","Rahul","Shahrukh", + "Naman","Deepak","Shreya","Rukhsana" + ), + salary=c(71,50,65,40, + 45,42,46,43, + 45,43,45,45, + 850,100,46,48 + )*1000, + sex=c("M","M","F","F", + "F","F","M","F", + "F","F","M","M", + "M","M","F","F" + ))->workers + workers$sno<-c(1:nrow(workers)) + workers[,.(sno,names,sex,salary)] +#+end_src + +#+RESULTS: worker-code0 +| sno | names | sex | salary | +|-----+----------+-----+--------| +| 1 | Anil | M | 71000 | +| 2 | Neeraj | M | 50000 | +| 3 
| Savita | F | 65000 | +| 4 | Srimati | F | 40000 | +| 5 | Rekha | F | 45000 | +| 6 | Pooja | F | 42000 | +| 7 | Alex | M | 46000 | +| 8 | Shahina | F | 43000 | +| 9 | Ghazal | F | 45000 | +| 10 | Lakshmi | F | 43000 | +| 11 | Rahul | M | 45000 | +| 12 | Shahrukh | M | 45000 | +| 13 | Naman | M | 850000 | +| 14 | Deepak | M | 100000 | +| 15 | Shreya | F | 46000 | +| 16 | Rukhsana | F | 48000 | + +#+NAME: freq-code + +#+begin_src R :results value :exports results :colnames yes :hline + workers[,.(frequency=length(sno)),.(sex)] +#+end_src + +#+RESULTS: +| sex | frequency | +|-----+-----------| +| M | 7 | +| F | 9 | + +#+RESULTS: freq-code +| sex | frequency | +|-----+-----------| +| M | 7 | +| F | 9 | + +*** Measures of Central Tendency :slide: + +#+NAME: mid-code +#+begin_src R :results value :exports results :colnames yes :hline + workers[,.(mean_salary=round(mean(salary),1), + median_salary=quantile(salary,prob=0.5))] +#+end_src + +#+RESULTS: mid-code +| mean_salary | median_salary | +|-------------+---------------| +| 101500 | 45500 | + +#+NAME: mid2-code +#+begin_src R :results value :exports results :colnames yes :hline + workers[,.(mean_salary=round(mean(salary),1), + median_salary=quantile(salary,prob=0.5)),.(sex)] +#+end_src + +#+RESULTS: mid2-code +| sex | mean_salary | median_salary | +|-----+-------------+---------------| +| M | 172428.6 | 50000 | +| F | 46333.3 | 45000 | + +*** Measures of Position :slide: + ++ First quartile ++ Second quartile (median) ++ Third quartile + ++ Deciles ++ Quintiles ++ Percentiles + +*** Measures of Dispersion :slide: + +**** Range and other measures based on positions :slide: + + +$range=max-min$ + +#+RESULTS: range-code +| min_salary | max_salary | range | +|------------+------------+--------| +| 40000 | 850000 | 810000 | + +#+NAME: range-code +#+begin_src R :results value :exports results :colnames yes :hline + workers[,.(min_salary=min(salary), + max_salary=max(salary), + range=max(salary)-min(salary))] +#+end_src + +**** 
Range and other measures based on positions :slide: + ++ Distance between any two positions (Deciles, Quintiles, Percentiles) can be used as a measure of dispersion. + +$inter.quartile.range=Q3-Q1$ + +#+RESULTS: summary-code +#+begin_example + 25% 75% +44500 53750 + 10% 90% +42500 85500 + 10% 95% + 42500 287500 + 25% 95% + 44500 287500 + 0% 75% +40000 53750 +#+end_example + +#+NAME: summary-code +#+begin_src R :results output :exports results :colnames yes :hline +## summary(workers$salary) + quantile(workers$salary,probs=c(0.25,0.75)) + quantile(workers$salary,probs=c(0.1,0.9)) + quantile(workers$salary,probs=c(0.1,0.95)) + quantile(workers$salary,probs=c(0.25,0.95)) + quantile(workers$salary,probs=c(0,0.75)) +#+end_src + + + +**** Variance, Standard Deviation and Coefficient of Variation + +$variance=\frac{1}{n} \times \sum(x_{i}-\bar{x})^{2}$ + +$standard.deviation = \sqrt{variance}$ + +$cov=\frac{standard.deviation}{mean}$ + +#+NAME: var-code +#+begin_src R :results value :exports results :colnames yes :hline + workers[,.(var_salary=round(var(salary),1), + sd_salary=round(sqrt(var(salary)),1), + cov_salary=round(sqrt(var(salary))/mean(salary),2)) + ] +#+end_src + +#+RESULTS: var-code +| var_salary | sd_salary | cov_salary | +|-------------+-----------+------------| +| 40075200000 | 200187.9 | 1.97 | + +#+NAME: var2-code +#+begin_src R :results value :exports results :colnames yes :hline + workers[,.(var_salary=round(var(salary),1), ## per-sex variance, sd and coefficient of variation of the workers table; var() is R's n-1 sample variance + sd_salary=round(sqrt(var(salary)),1), + cov_salary=round(sqrt(var(salary))/mean(salary),2)),.(sex)] +#+end_src + +#+RESULTS: var2-code +| sex | var_salary | sd_salary | cov_salary | +|-----+-------------+-----------+------------| +| M | 89680952381 | 299467.8 | 1.74 | +| F | 54500000 | 7382.4 | 0.16 | + + + +** Graphical Displays of Quantitative Information: Common Pitfalls + +*** Common uses of statistical graphics :slide: ++ To show trends over time ++ To show mid-point variations across categories ++ To show composition ++ (less 
commonly, though more usefully) to show/analyse dispersion + +*** Mis-representation :slide: + +#+CAPTION: "and sometimes the fact that numbers have a magnitude as well as an order is simply forgotten" +[[file:graphics/tufte-insanity.png]] + +*** Mis-representation :slide: + +#+CAPTION: Another example borrowed from Tufte +[[file:graphics/tufte-fuel.png]] + +*** Mis-representation :slide: + +#+CAPTION: Tufte's graph on fuel economy of cars +#+attr_html: :width 400px +[[file:graphics/tufte-fuel2.png]] + +*** Mis-representation :slide: + +#+CAPTION: Nobel prizes awarded in science (National Science Foundation, 1974) +#+attr_html: :width 300px +[[file:graphics/nobel-wrong.png]] + +*** Mis-representation :slide: + +#+CAPTION: Nobel prizes awarded in science (corrected by Tufte) +#+attr_html: :width 300px +[[file:graphics/nobel-right.png]] + +*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide: + +[[file:graphics/piketty1_o.png]] + +*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide: + +[[file:graphics/piketty1_c.png]] + +*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide: + +[[file:graphics/piketty2_o.png]] + +*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide: + +[[file:graphics/piketty2_c.png]] + +*** The problem multiplied with the coming in of spreadsheets :slide: + +#+ATTR_html: :width 300px +[[file:graphics/chart1.png]] + +#+ATTR_html: :width 300px +[[file:graphics/chart2.png]] + +#+ATTR_html: :width 300px +[[file:graphics/chart3.png]] + +** Graphical Displays of Quantitative Information: Dispersion :slide: +*** Histogram :slide: + +#+RESULTS: ccpc-wheat-hist1 +#+attr_html: :width 800px +[[file:productionhist1.png]] + +#+NAME: ccpc-wheat-hist1 +#+BEGIN_SRC R :results output graphics :exports results :file productionhist1.png :width 400 :height 300 :type cairo :family Garamond + 
subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% 20 )->b + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + hist(b$yield,main="Histogram of wheat yields",ylim=c(0,4000)) +#+END_SRC + +*** Histogram with smaller bins + +#+RESULTS: ccpc-wheat-hist11 +[[file:productionhist11.png]] + +#+NAME: ccpc-wheat-hist11 +#+BEGIN_SRC R :results output graphics :exports results :file productionhist11.png :width 400 :height 300 :type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% 20 )->b + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + hist(b$yield,main="Histogram of wheat yields",breaks=seq(0,25000,250),ylim=c(0,4000)) +#+END_SRC + +*** Histogram with smaller bins + +#+RESULTS: ccpc-wheat-hist12 +[[file:productionhist12.png]] + +#+NAME: ccpc-wheat-hist12 +#+BEGIN_SRC R :results output graphics :exports results :file productionhist12.png :width 400 :height 300 :type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% 20 )->b + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + hist(b$yield,main="Histogram of wheat yields",breaks=seq(0,25000,250)) +#+END_SRC + +*** Histogram (absolute frequencies) with unequal bins distorts the shape +#+NAME: ccpc-wheat-hist3 +#+BEGIN_SRC R :results output graphics :exports results :file productionhist3.png :width 400 :height 300 :type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% 20 )->b + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + hist(b$yield,freq=T,main="Histogram of wheat yields",breaks=c(0,1000,1500,2000,2200,2500,3000,3200,3400,3800,4000,5000,10000,21000)) +#+END_SRC + +#+RESULTS: ccpc-wheat-hist3 +[[file:productionhist3.png]] + +*** Histogram with relative densities :slide: + +#+RESULTS: ccpc-wheat-hist2 +#+attr_html: :width 600px +[[file:productionhist2.png]] + +#+NAME: ccpc-wheat-hist2 +#+BEGIN_SRC R :results output graphics :exports results :file productionhist2.png :width 400 :height 300 
:type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% 20)->b + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + hist(b$yield,freq=F,main="Histogram of wheat yields",ylim=c(0,0.00040)) +#+END_SRC + +*** Histogram with relative densities + +#+RESULTS: ccpc-wheat-hist21 +[[file:productionhist21.png]] + +#+NAME: ccpc-wheat-hist21 +#+BEGIN_SRC R :results output graphics :exports results :file productionhist21.png :width 400 :height 300 :type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% 20)->b + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + hist(b$yield,freq=F,main="Histogram of wheat yields",breaks=seq(0,25000,250),ylim=c(0,0.00040)) +#+END_SRC + +*** Histogram with unequal bins must use relative densities + +#+RESULTS: ccpc-wheat-hist4 +[[file:productionhist4.png]] + +#+NAME: ccpc-wheat-hist4 +#+BEGIN_SRC R :results output graphics :exports results :file productionhist4.png :width 400 :height 300 :type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% 20 )->b + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + hist(b$yield,freq=F,main="Histogram of wheat yields",breaks=c(0,1000,1500,2000,2200,2500,3000,3200,3400,3800,4000,5000,10000,21000),ylim=c(0,0.00040)) +#+END_SRC + +*** Boxplot :slide: + +**** Invented by John Tukey in 1970 +**** Many variations proposed since then, though the essential form and idea have remained intact. 
+ + + +*** Boxplot of wheat yields :slide: + +#+RESULTS: ccpc-wheat-box1 +[[file:boxplotyield1.png]] + +#+NAME: ccpc-wheat-box1 +#+BEGIN_SRC R :results output graphics :exports results :file boxplotyield1.png :width 400 :height 300 :type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% 20 )->b + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + boxplot(b$yield,main="Boxplot of wheat yields") +#+END_SRC + +*** Violin plots :slide: + +#+RESULTS: ccpc-wheat-vio1 +[[file:vioplotyield1.png]] + +#+NAME: ccpc-wheat-vio1 +#+BEGIN_SRC R :results output graphics :exports results :file vioplotyield1.png :width 400 :height 300 :type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% 20 )->b + library(vioplot) + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + vioplot(b$yield) +#+END_SRC + + + + +*** Boxplots: Useful to identify extreme values :slide: + + +#+RESULTS: ccpc-wheat-box2 +[[file:boxplotyield2.png]] +#+NAME: ccpc-wheat-box2 +#+BEGIN_SRC R :results output graphics :exports results :file boxplotyield2.png :width 400 :height 300 :type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% 20 )->b + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + boxplot(b$yield,main="Magnified tail of the boxplot",ylim=c(7000,25000)) +#+END_SRC + +*** Boxplots: Useful for comparisons across categories :slide: + +#+RESULTS: ccpc-crop-box3 +[[file:boxplotyield3.png]] +#+NAME: ccpc-crop-box3 +#+BEGIN_SRC R :results output graphics :exports results :file boxplotyield3.png :width 400 :height 280 :type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% c(10,20,40,140,150,450,510,680,900))->b + factor(b$Crop_code,levels=c(10,20,40,140,150,450,510,680,900))->b$Crop_code ## pin the level order to the numeric codes so the labels below line up + levels(b$Crop_code)<-c("Paddy","Wheat","Maize","Bajra","Ragi","Gram","Red gram","Groundnut","Mustard") ## NOTE(review): code 20 is wheat in the other blocks of this file; confirm the remaining code/label pairs + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + boxplot(yield~Crop_code,data=b,main="Boxplots of yields of 
various crops",las=3,ylim=c(0,8000),outline=F) + #+END_SRC + + +*** Violin plots :slide: + +#+RESULTS: ccpc-crop-vio +[[file:vioplotyield3.png]] + +#+NAME: ccpc-crop-vio +#+BEGIN_SRC R :results output graphics :exports results :file vioplotyield3.png :width 400 :height 280 :type cairo :family Garamond + subset(ccpc,Year_Agriculture==2009)->b + subset(b,Crop_code %in% c(10,20,40,140,150,450,510,680,900))->b + factor(b$Crop_code,levels=c(10,20,40,140,150,450,510,680,900))->b$Crop_code ## pin the level order to the numeric codes so the labels below line up + levels(b$Crop_code)<-c("Paddy","Wheat","Maize","Bajra","Ragi","Gram","Red gram","Groundnut","Mustard") ## NOTE(review): code 20 is wheat in the other blocks of this file; confirm the remaining code/label pairs + b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield + + vioplot(b$yield[b$Crop_code=="Wheat"],b$yield[b$Crop_code=="Paddy"],b$yield[b$Crop_code=="Maize"],names=c("Wheat","Paddy","Maize")) + #+END_SRC + + + +* Workshop plan +** Introduction to R +** Data Tables +** ggplot2 + +* Datasets + +** Census +** PLFS/Chandan's data +** Suicides +** CPI +** Pulses + diff --git a/graphics/chart1.png b/graphics/chart1.png new file mode 100644 index 0000000..919206b --- /dev/null +++ b/graphics/chart1.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae3ad29e377aec71c44cc50f89813d63e479b44a74cd96473b4b72998c68069e +size 145682 diff --git a/graphics/chart2.png b/graphics/chart2.png new file mode 100644 index 0000000..13664c3 --- /dev/null +++ b/graphics/chart2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:706098eeeec998b32d7c87561fb6e74d537e85d03d5398476f1ebb80cd893102 +size 95468 diff --git a/graphics/chart3.png b/graphics/chart3.png new file mode 100644 index 0000000..7968f84 --- /dev/null +++ b/graphics/chart3.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74e567a2caa6a75ce8d2dc036f6ebfe8117867ffcbd9cbac28c5fe050c8daf02 +size 116299 diff --git a/graphics/nobel-right.png b/graphics/nobel-right.png new file mode 100644 index 0000000..0f4cba8 --- /dev/null +++ b/graphics/nobel-right.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:628cb7ef2cc08848688ef3a59a34d5e3f32dfb750fb1a140f0cb79dc660cacba +size 36412 diff --git a/graphics/nobel-wrong.png b/graphics/nobel-wrong.png new file mode 100644 index 0000000..3d88348 --- /dev/null +++ b/graphics/nobel-wrong.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3ebcea2e3c3b65b925a160887c32a0bc1439e429a18870e53157114e72e38c6 +size 34715 diff --git a/graphics/piketty1_c.png b/graphics/piketty1_c.png new file mode 100644 index 0000000..0f0d77a --- /dev/null +++ b/graphics/piketty1_c.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb10c9f6c71aa99f15950b74e1c0ee6c00da91ada5901a1d7db9a8c6d7c58897 +size 725335 diff --git a/graphics/piketty1_o.png b/graphics/piketty1_o.png new file mode 100644 index 0000000..79eb09f --- /dev/null +++ b/graphics/piketty1_o.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5e1ed65cd2e220146ee803fd158dc361c78ca0f8d73af2187c76677feba1790 +size 590567 diff --git a/graphics/piketty2_c.png b/graphics/piketty2_c.png new file mode 100644 index 0000000..b23f7f7 --- /dev/null +++ b/graphics/piketty2_c.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45afc808f3e4c8ad4f35a0f56630ff70297f6762d330dd53e5df995dc8147411 +size 897235 diff --git a/graphics/piketty2_o.png b/graphics/piketty2_o.png new file mode 100644 index 0000000..06a5404 --- /dev/null +++ b/graphics/piketty2_o.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff839c162ce0e5239dc40578c8ed055f737fadebda9022d7a9f9c45bffc58a72 +size 747076 diff --git a/graphics/tufte-fuel.png b/graphics/tufte-fuel.png new file mode 100644 index 0000000..ec867ec --- /dev/null +++ b/graphics/tufte-fuel.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d93fb7058e122847fdcb6dd510b818f0b5b2a0d7f23be483e9fc07cfa73e43d +size 91324 diff --git a/graphics/tufte-fuel2.png b/graphics/tufte-fuel2.png new file mode 100644 index 0000000..ebcefe1 
--- /dev/null +++ b/graphics/tufte-fuel2.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b0dc7cdb16840095f074ea040f061de9f6218e6ae4eb387c70a8ec19f2eb570 +size 56277 diff --git a/graphics/tufte-insanity.png b/graphics/tufte-insanity.png new file mode 100644 index 0000000..1b57bae --- /dev/null +++ b/graphics/tufte-insanity.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59c3cde3511adb44933e0be46e5a8b2ac317477d4d5fc3806d9ad5c330ef1a60 +size 57977