You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

392 lines
12 KiB

#+TITLE: Quantitative Methods
#+PROPERTY: header-args:R :session acj :eval never-export
#+STARTUP: hideall inlineimages hideblocks
#+SETUPFILE: https://fniessen.github.io/org-html-themes/setup/theme-readtheorg.setup
#+HTML_HEAD: <style>#content{max-width:1200px;} </style>
* Title slide :slide:
#+BEGIN_SRC emacs-lisp-slide
(org-show-animate '("Quantitative Methods, Part-II" "Vikas Rawal" "Prachi Bansal" "" "" ""))
#+END_SRC
* Day 1
** Title slide
#+BEGIN_SRC emacs-lisp-slide
(org-show-animate '("Why do financial journalists need to know quantitative methods?" "" "" ""))
#+END_SRC
** What do we aim to achieve in this course? :slide:
**** Make friends with numbers
**** Learn how to read numbers, how to present them, and how to write about them
**** Learn how to use computers to work with numbers
** Two Types of Statistics :slide:
*** Descriptive Statistics
**** Use summaries of data for the entire population to describe a population
**** Use summaries of sample data to describe a sample
*** Inferential Statistics
**** Use sample data to describe a population
** Descriptive Statistics :slide:
+ Frequency
+ Measures of central tendency
+ Summary positions
+ Measures of dispersion
*** Frequency :slide:
#+NAME: worker-code0
#+begin_src R :results value :exports results :colnames yes :hline
# Build the toy `workers` table used by the Day 1 examples: one row per
# worker with a name, a salary and a sex code ("M"/"F"), plus a serial number.
library(data.table)

workers <- data.table(
  names = c(
    "Anil", "Neeraj", "Savita", "Srimati",
    "Rekha", "Pooja", "Alex", "Shahina",
    "Ghazal", "Lakshmi", "Rahul", "Shahrukh",
    "Naman", "Deepak", "Shreya", "Rukhsana"
  ),
  # Salaries are typed in thousands and scaled up. They are stored as
  # integers so that 100000 is tabulated as "100000" rather than "1e+05".
  salary = as.integer(c(
    71, 50, 65, 40,
    45, 42, 46, 43,
    45, 43, 45, 45,
    850, 100, 46, 48
  ) * 1000),
  sex = c(
    "M", "M", "F", "F",
    "F", "F", "M", "F",
    "F", "F", "M", "M",
    "M", "M", "F", "F"
  )
)

# Serial number 1..n; seq_len() is also safe for an empty table.
workers$sno <- seq_len(nrow(workers))

# The last expression is the value of the block: the table in display order.
workers[, .(sno, names, sex, salary)]
#+end_src
#+RESULTS: worker-code0
| sno | names | sex | salary |
|-----+----------+-----+--------|
| 1 | Anil | M | 71000 |
| 2 | Neeraj | M | 50000 |
| 3 | Savita | F | 65000 |
| 4 | Srimati | F | 40000 |
| 5 | Rekha | F | 45000 |
| 6 | Pooja | F | 42000 |
| 7 | Alex | M | 46000 |
| 8 | Shahina | F | 43000 |
| 9 | Ghazal | F | 45000 |
| 10 | Lakshmi | F | 43000 |
| 11 | Rahul | M | 45000 |
| 12 | Shahrukh | M | 45000 |
| 13 | Naman | M | 850000 |
| 14 | Deepak | M | 100000 |
| 15 | Shreya | F | 46000 |
| 16 | Rukhsana | F | 48000 |
#+NAME: freq-code
#+begin_src R :results value :exports results :colnames yes :hline
# Frequency table: number of workers of each sex (.N is the group row count).
workers[, .(frequency = .N), by = .(sex)]
#+end_src
#+RESULTS:
| sex | frequency |
|-----+-----------|
| M | 7 |
| F | 9 |
#+RESULTS: freq-code
| sex | frequency |
|-----+-----------|
| M | 7 |
| F | 9 |
*** Measures of Central Tendency :slide:
#+NAME: mid-code
#+begin_src R :results value :exports results :colnames yes :hline
# Mean (rounded to one decimal) and median of all salaries.
# The median is taken as the 0.5 quantile; `probs` is spelled out in full
# instead of relying on partial matching of `prob`.
workers[, .(
  mean_salary = round(mean(salary), 1),
  median_salary = quantile(salary, probs = 0.5)
)]
#+End_src
#+RESULTS: mid-code
| mean_salary | median_salary |
|-------------+---------------|
| 101500 | 45500 |
#+NAME: mid2-code
#+begin_src R :results value :exports results :colnames yes :hline
# Mean (rounded to one decimal) and median salary, computed separately
# for each sex. `probs` is spelled out in full instead of relying on
# partial matching of `prob`.
workers[, .(
  mean_salary = round(mean(salary), 1),
  median_salary = quantile(salary, probs = 0.5)
), by = .(sex)]
#+end_src
#+RESULTS: mid2-code
| sex | mean_salary | median_salary |
|-----+-------------+---------------|
| M | 172428.6 | 50000 |
| F | 46333.3 | 45000 |
*** Measures of Position :slide:
+ First quartile
+ Second quartile (median)
+ Third quartile
+ Deciles
+ Quintiles
+ Percentiles
*** Measures of Dispersion :slide:
**** Range and other measures based on positions :slide:
$range=max-min$
#+RESULTS: range-code
| min_salary | max_salary | range |
|------------+------------+--------|
| 40000 | 850000 | 810000 |
#+NAME: range-code
#+begin_src R :results value :exports results :colnames yes :hline
# Smallest and largest salary and the distance between them.
workers[, {
  limits <- range(salary) # c(minimum, maximum)
  .(
    min_salary = limits[1],
    max_salary = limits[2],
    range = limits[2] - limits[1]
  )
}]
#+end_src
**** Range and other measures based on positions :slide:
+ Distance between any two positions (Deciles, Quintiles, Percentiles) can be used as a measure of dispersion.
$inter.quartile.range=Q3-Q1$
#+RESULTS: summary-code
#+begin_example
25% 75%
44500 53750
10% 90%
42500 85500
10% 95%
42500 287500
25% 95%
44500 287500
0% 75%
40000 53750
#+end_example
#+NAME: summary-code
#+begin_src R :results output :exports results :colnames yes :hline
## summary(workers$salary)
# Print the salary quantiles for several pairs of positions; the distance
# between the two values of any pair can serve as a measure of dispersion.
invisible(lapply(
  list(
    c(0.25, 0.75),
    c(0.1, 0.9),
    c(0.1, 0.95),
    c(0.25, 0.95),
    c(0, 0.75)
  ),
  function(positions) print(quantile(workers$salary, probs = positions))
))
#+end_src
**** Variance, Standard Deviation and Coefficient of Variation
$variance=\frac{1}{n-1} \times \sum(x_{i}-\bar{x})^{2}$
$standard.deviation = \sqrt{variance}$
$cov=\frac{standard.deviation}{mean}$
#+NAME: var-code
#+begin_src R :results value :exports results :colnames yes :hline
# Variance, standard deviation and coefficient of variation (standard
# deviation divided by the mean) of all salaries, rounded for display.
workers[, .(
  var_salary = round(var(salary), 1),
  sd_salary = round(sd(salary), 1),
  cov_salary = round(sd(salary) / mean(salary), 2)
)]
#+end_src
#+RESULTS: var-code
| var_salary | sd_salary | cov_salary |
|-------------+-----------+------------|
| 40075200000 | 200187.9 | 1.97 |
#+NAME: var2-code
#+begin_src R :results value :exports results :colnames yes :hline
# Variance, standard deviation and coefficient of variation of salaries,
# computed separately for each sex. The table is `workers`; no `students`
# object is created by the blocks shown in this file.
workers[, .(
  var_salary = round(var(salary), 1),
  sd_salary = round(sqrt(var(salary)), 1),
  cov_salary = round(sqrt(var(salary)) / mean(salary), 2)
), by = .(sex)]
#+end_src
#+RESULTS: var2-code
| sex | var_salary | sd_salary | cov_salary |
|-----+-------------+-----------+------------|
| M | 89680952381 | 299467.8 | 1.74 |
| F | 54500000 | 7382.4 | 0.16 |
** Graphical Displays of Quantitative Information: Common Pitfalls
*** Common uses of statistical graphics :slide:
+ To show trends over time
+ To show mid-point variations across categories
+ To show composition
+ (less commonly, though more usefully) to show/analyse dispersion
*** Mis-representation :slide:
#+CAPTION: "and sometimes the fact that numbers have a magnitude as well as an order is simply forgotten"
[[file:graphics/tufte-insanity.png]]
*** Mis-representation :slide:
#+CAPTION: Another example borrowed from Tufte
[[file:graphics/tufte-fuel.png]]
*** Mis-representation :slide:
#+CAPTION: Tufte's graph on fuel economy of cars
#+attr_html: :width 400px
[[file:graphics/tufte-fuel2.png]]
*** Mis-representation :slide:
#+CAPTION: Nobel prizes awarded in science (National Science Foundation, 1974)
#+attr_html: :width 300px
[[file:graphics/nobel-wrong.png]]
*** Mis-representation :slide:
#+CAPTION: Nobel prizes awarded in science (corrected by Tufte)
#+attr_html: :width 300px
[[file:graphics/nobel-right.png]]
*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide:
[[file:graphics/piketty1_o.png]]
*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide:
[[file:graphics/piketty1_c.png]]
*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide:
[[file:graphics/piketty2_o.png]]
*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide:
[[file:graphics/piketty2_c.png]]
*** The problem multiplied with the coming in of spreadsheets :slide:
#+ATTR_html: :width 300px
[[file:graphics/chart1.png]]
#+ATTR_html: :width 300px
[[file:graphics/chart2.png]]
#+ATTR_html: :width 300px
[[file:graphics/chart3.png]]
** Graphical Displays of Quantitative Information: Dispersion :slide:
*** Histogram :slide:
#+RESULTS: ccpc-wheat-hist1
#+attr_html: :width 800px
[[file:productionhist1.png]]
#+NAME: ccpc-wheat-hist1
#+BEGIN_SRC R :results output graphics :exports results :file productionhist1.png :width 400 :height 300 :type cairo :family Garamond
# Frequency histogram of yields for the 2009 records of crop code 20
# (titled as wheat by the plot below).
b <- subset(ccpc, Year_Agriculture == 2009 & Crop_code %in% 20)
# Yield = main product * 100 / cropped area.
# NOTE(review): column names suggest quintals and hectares, i.e. kg/ha -- confirm.
b$yield <- with(b, Main_Product_Qtls * 100 / Crop_Area_Ha)
hist(b$yield, main = "Histogram of wheat yields", ylim = c(0, 4000))
#+END_SRC
*** Histogram with relative densities :slide:
#+RESULTS: ccpc-wheat-hist2
#+attr_html: :width 600px
[[file:productionhist2.png]]
#+NAME: ccpc-wheat-hist2
#+BEGIN_SRC R :results output graphics :exports results :file productionhist2.png :width 400 :height 300 :type cairo :family Garamond
# Density histogram of yields for the 2009 records of crop code 20
# (titled as wheat by the plot below).
b <- subset(ccpc, Year_Agriculture == 2009 & Crop_code %in% 20)
# Yield = main product * 100 / cropped area.
# NOTE(review): column names suggest quintals and hectares, i.e. kg/ha -- confirm.
b$yield <- with(b, Main_Product_Qtls * 100 / Crop_Area_Ha)
# freq = FALSE plots densities instead of counts; FALSE is written out
# because the shorthand F is an ordinary variable that can be reassigned.
hist(b$yield, freq = FALSE, main = "Histogram of wheat yields", ylim = c(0, 0.00040))
#+END_SRC
*** Boxplot :slide:
+ Invented by John Tukey in 1970
+ Many variations proposed since then, though the essential form and idea has remained intact.
*** Boxplot of wheat yields :slide:
#+RESULTS: ccpc-wheat-box1
[[file:boxplotyield1.png]]
#+NAME: ccpc-wheat-box1
#+BEGIN_SRC R :results output graphics :exports results :file boxplotyield1.png :width 400 :height 300 :type cairo :family Garamond
# Boxplot of yields for the 2009 records of crop code 20
# (titled as wheat by the plot below).
b <- subset(ccpc, Year_Agriculture == 2009 & Crop_code %in% 20)
# Yield = main product * 100 / cropped area.
# NOTE(review): column names suggest quintals and hectares, i.e. kg/ha -- confirm.
b$yield <- with(b, Main_Product_Qtls * 100 / Crop_Area_Ha)
boxplot(b$yield, main = "Boxplot of wheat yields")
#+END_SRC
*** Violin plots :slide:
#+RESULTS: ccpc-wheat-vio1
[[file:vioplotyield1.png]]
#+NAME: ccpc-wheat-vio1
#+BEGIN_SRC R :results output graphics :exports results :file vioplotyield1.png :width 400 :height 300 :type cairo :family Garamond
# Violin plot of yields for the 2009 records of crop code 20.
library(vioplot)

b <- subset(ccpc, Year_Agriculture == 2009 & Crop_code %in% 20)
# Yield = main product * 100 / cropped area.
# NOTE(review): column names suggest quintals and hectares, i.e. kg/ha -- confirm.
b$yield <- with(b, Main_Product_Qtls * 100 / Crop_Area_Ha)
vioplot(b$yield)
#+END_SRC
*** Boxplots: Useful to identify extreme values :slide:
#+RESULTS: ccpc-wheat-box2
[[file:boxplotyield2.png]]
#+NAME: ccpc-wheat-box2
#+BEGIN_SRC R :results output graphics :exports results :file boxplotyield2.png :width 400 :height 300 :type cairo :family Garamond
# Same boxplot as for the full yield distribution of crop code 20 in 2009,
# but with the y-axis restricted to 7000-25000 so the upper tail is magnified.
b <- subset(ccpc, Year_Agriculture == 2009 & Crop_code %in% 20)
# Yield = main product * 100 / cropped area.
# NOTE(review): column names suggest quintals and hectares, i.e. kg/ha -- confirm.
b$yield <- with(b, Main_Product_Qtls * 100 / Crop_Area_Ha)
boxplot(b$yield, main = "Magnified tail of the boxplot", ylim = c(7000, 25000))
#+END_SRC
*** Boxplots: Useful for comparisons across categories :slide:
#+RESULTS: ccpc-crop-box3
[[file:boxplotyield3.png]]
#+NAME: ccpc-crop-box3
#+BEGIN_SRC R :results output graphics :exports results :file boxplotyield3.png :width 400 :height 280 :type cairo :family Garamond
# Side-by-side boxplots of 2009 yields for nine crops.
#
# Crop codes are mapped to names explicitly (code -> label) instead of
# overwriting levels() by position: positional assignment silently mislabels
# crops if the codes sort differently (e.g. stored as text) and fails if a
# code has no 2009 records.
# Code 20 is plotted as wheat by the single-crop blocks of this file, so
# code 10 is labelled paddy here.
# NOTE(review): confirm every code/name pair against the CCPC codebook.
crop_codes <- c(10, 20, 40, 140, 150, 450, 510, 680, 900)
crop_names <- c(
  "Paddy", "Wheat", "Maize", "Bajra", "Ragi",
  "Gram", "Red gram", "Groundnut", "Mustard"
)

b <- subset(ccpc, Year_Agriculture == 2009 & Crop_code %in% crop_codes)
b$Crop_code <- factor(b$Crop_code, levels = crop_codes, labels = crop_names)
# Yield = main product * 100 / cropped area.
# NOTE(review): column names suggest quintals and hectares, i.e. kg/ha -- confirm.
b$yield <- with(b, Main_Product_Qtls * 100 / Crop_Area_Ha)
# outline = FALSE hides outlier points; las = 3 sets the axis label orientation.
boxplot(
  yield ~ Crop_code,
  data = b,
  main = "Boxplots of yields of various crops",
  las = 3,
  ylim = c(0, 8000),
  outline = FALSE
)
#+END_SRC
*** Violin plots :slide:
#+RESULTS: ccpc-crop-vio
[[file:vioplotyield3.png]]
#+NAME: ccpc-crop-vio
#+BEGIN_SRC R :results output graphics :exports results :file vioplotyield3.png :width 400 :height 280 :type cairo :family Garamond
subset(ccpc,Year_Agriculture==2009)->b
subset(b,Crop_code %in% c(10,20,40,140,150,450,510,680,900))->b
factor(b$Crop_code)->b$Crop_code
levels(b$Crop_code)<-c("Wheat","Paddy","Maize","Bajra","Ragi","Gram","Red gram","Groundnut","Mustard")
b$Main_Product_Qtls*100/b$Crop_Area_Ha->b$yield
vioplot(b$yield[b$Crop_code=="Wheat"],b$yield[b$Crop_code=="Paddy"],b$yield[b$Crop_code=="Maize"])
#+END_SRC