You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
392 lines
12 KiB
392 lines
12 KiB
#+TITLE: Quantitative Methods
|
|
#+PROPERTY: header-args:R :session acj :eval never-export
|
|
#+STARTUP: hideall inlineimages hideblocks
|
|
#+SETUPFILE: https://fniessen.github.io/org-html-themes/setup/theme-readtheorg.setup
|
|
#+HTML_HEAD: <style>#content{max-width:1200px;} </style>
|
|
|
|
* Title slide :slide:
|
|
#+BEGIN_SRC emacs-lisp-slide
|
|
;; Title-slide strings, passed to org-show-animate as one quoted list
;; (course title, the two instructors, then empty padding strings).
(org-show-animate '("Quantitative Methods, Part-II" "Vikas Rawal" "Prachi Bansal" "" "" ""))
|
|
#+END_SRC
|
|
* Day 1
|
|
** Title slide
|
|
#+BEGIN_SRC emacs-lisp-slide
|
|
;; Opening question for Day 1, passed to org-show-animate as a quoted list
;; followed by empty padding strings.
(org-show-animate '("Why do financial journalists need to know quantitative methods?" "" "" ""))
|
|
#+END_SRC
|
|
|
|
** What do we aim to achieve in this course? :slide:
|
|
**** Make friends with numbers
|
|
**** Learn how to read numbers, how to present them, and how to write about them
|
|
**** Learn how to use computers to work with numbers
|
|
** Two Types of Statistics :slide:
|
|
*** Descriptive Statistics
|
|
**** Use summaries of data for the entire population to describe a population
|
|
**** Use summaries of sample data to describe a sample
|
|
*** Inferential Statistics
|
|
**** Use sample data to describe a population
|
|
** Descriptive Statistics :slide:
|
|
+ Frequency
|
|
+ Measures of central tendency
|
|
+ Summary positions
|
|
+ Measures of dispersion
|
|
|
|
*** Frequency :slide:
|
|
|
|
#+NAME: worker-code0
|
|
#+begin_src R :results value :exports results :colnames yes :hline
|
|
# Toy data set used by the descriptive-statistics slides: one row per worker
# with name, salary and sex. The object `workers` is left in the R session
# for the later blocks.
library(data.table)

workers <- data.table(
  names = c(
    "Anil", "Neeraj", "Savita", "Srimati",
    "Rekha", "Pooja", "Alex", "Shahina",
    "Ghazal", "Lakshmi", "Rahul", "Shahrukh",
    "Naman", "Deepak", "Shreya", "Rukhsana"
  ),
  # Entered in thousands, then scaled by 1000.
  salary = c(
    71, 50, 65, 40,
    45, 42, 46, 43,
    45, 43, 45, 45,
    850, 100, 46, 48
  ) * 1000,
  sex = c(
    "M", "M", "F", "F",
    "F", "F", "M", "F",
    "F", "F", "M", "M",
    "M", "M", "F", "F"
  )
)

# Serial number per row. seq_len() is used instead of c(1:nrow(workers)),
# which would produce c(1, 0) if the table were ever empty.
workers$sno <- seq_len(nrow(workers))

# The value of this last expression is what gets rendered as the table.
workers[, .(sno, names, sex, salary)]
|
|
#+end_src
|
|
|
|
#+RESULTS: worker-code0
|
|
| sno | names | sex | salary |
|
|
|-----+----------+-----+--------|
|
|
| 1 | Anil | M | 71000 |
|
|
| 2 | Neeraj | M | 50000 |
|
|
| 3 | Savita | F | 65000 |
|
|
| 4 | Srimati | F | 40000 |
|
|
| 5 | Rekha | F | 45000 |
|
|
| 6 | Pooja | F | 42000 |
|
|
| 7 | Alex | M | 46000 |
|
|
| 8 | Shahina | F | 43000 |
|
|
| 9 | Ghazal | F | 45000 |
|
|
| 10 | Lakshmi | F | 43000 |
|
|
| 11 | Rahul | M | 45000 |
|
|
| 12 | Shahrukh | M | 45000 |
|
|
| 13 | Naman | M | 850000 |
|
|
| 14 | Deepak | M | 1e+05 |
|
|
| 15 | Shreya | F | 46000 |
|
|
| 16 | Rukhsana | F | 48000 |
|
|
|
|
#+NAME: freq-code
#+begin_src R :results value :exports results :colnames yes :hline
|
|
# Frequency table: number of workers in each sex category.
workers[, .(frequency = .N), by = .(sex)]
|
|
#+end_src
|
|
|
|
#+RESULTS:
|
|
| sex | frequency |
|
|
|-----+-----------|
|
|
| M | 7 |
|
|
| F | 9 |
|
|
|
|
#+RESULTS: freq-code
|
|
| sex | frequency |
|
|
|-----+-----------|
|
|
| M | 7 |
|
|
| F | 9 |
|
|
|
|
*** Measures of Central Tendency :slide:
|
|
|
|
#+NAME: mid-code
|
|
#+begin_src R :results value :exports results :colnames yes :hline
|
|
# Mean and median salary over all workers. The median is taken as the 0.5
# quantile; the argument is spelled `probs` in full instead of relying on
# partial matching of `prob`.
workers[, .(
  mean_salary = round(mean(salary), 1),
  median_salary = quantile(salary, probs = 0.5)
)]
|
|
#+End_src
|
|
|
|
#+RESULTS: mid-code
|
|
| mean_salary | median_salary |
|
|
|-------------+---------------|
|
|
| 101500 | 45500 |
|
|
|
|
#+NAME: mid2-code
|
|
#+begin_src R :results value :exports results :colnames yes :hline
|
|
# Mean and median salary computed separately for each sex. The median is
# the 0.5 quantile; `probs` is spelled in full instead of relying on
# partial matching of `prob`.
workers[, .(
  mean_salary = round(mean(salary), 1),
  median_salary = quantile(salary, probs = 0.5)
), by = .(sex)]
|
|
#+end_src
|
|
|
|
#+RESULTS: mid2-code
|
|
| sex | mean_salary | median_salary |
|
|
|-----+-------------+---------------|
|
|
| M | 172428.6 | 50000 |
|
|
| F | 46333.3 | 45000 |
|
|
|
|
*** Measures of Position :slide:
|
|
+ First quartile
|
|
+ Second quartile (median)
|
|
+ Third quartile
|
|
|
|
+ Deciles
|
|
+ Quintiles
|
|
+ Percentiles
|
|
|
|
*** Measures of Dispersion :slide:
|
|
|
|
**** Range and other measures based on positions :slide:
|
|
|
|
|
|
$range=max-min$
|
|
|
|
#+RESULTS: range-code
|
|
| min_salary | max_salary | range |
|
|
|------------+------------+--------|
|
|
| 40000 | 850000 | 810000 |
|
|
|
|
#+NAME: range-code
|
|
#+begin_src R :results value :exports results :colnames yes :hline
|
|
# Smallest and largest salary, and the distance between them (the range).
workers[, {
  lowest <- min(salary)
  highest <- max(salary)
  .(min_salary = lowest, max_salary = highest, range = highest - lowest)
}]
|
|
#+end_src
|
|
|
|
**** Range and other measures based on positions :slide:
|
|
|
|
+ Distance between any two positions (Deciles, Quintiles, Percentiles) can be used as a measure of dispersion.
|
|
|
|
$inter.quartile.range=Q3-Q1$
|
|
|
|
#+RESULTS: summary-code
|
|
#+begin_example
|
|
25% 75%
|
|
44500 53750
|
|
10% 90%
|
|
42500 85500
|
|
10% 95%
|
|
42500 287500
|
|
25% 95%
|
|
44500 287500
|
|
0% 75%
|
|
40000 53750
|
|
#+end_example
|
|
|
|
#+NAME: summary-code
|
|
#+begin_src R :results output :exports results :colnames yes :hline
|
|
## summary(workers$salary)   (one-call alternative, left disabled)

# Print pairs of salary quantiles; the gap between the two positions in each
# pair can serve as a measure of dispersion. Printing is explicit because
# the calls run inside lapply(); invisible() suppresses the returned list.
invisible(lapply(
  list(
    c(0.25, 0.75),
    c(0.1, 0.9),
    c(0.1, 0.95),
    c(0.25, 0.95),
    c(0, 0.75)
  ),
  function(pair) print(quantile(workers$salary, probs = pair))
))
|
|
#+end_src
|
|
|
|
|
|
|
|
**** Variance, Standard Deviation and Coefficient of Variation
|
|
|
|
$variance=\frac{1}{n} \times \sum(x_{i}-\bar{x})^{2}$
|
|
|
|
$standard.deviation = \sqrt{variance}$
|
|
|
|
$cov=\frac{standard.deviation}{mean}$
|
|
|
|
#+NAME: var-code
|
|
#+begin_src R :results value :exports results :colnames yes :hline
|
|
# Variance, standard deviation and coefficient of variation of salary for
# all workers. Note that var() divides by n - 1 (sample variance).
workers[, {
  v <- var(salary)
  s <- sqrt(v)
  .(
    var_salary = round(v, 1),
    sd_salary = round(s, 1),
    cov_salary = round(s / mean(salary), 2)
  )
}]
|
|
#+end_src
|
|
|
|
#+RESULTS: var-code
|
|
| var_salary | sd_salary | cov_salary |
|
|
|-------------+-----------+------------|
|
|
| 40075200000 | 200187.9 | 1.97 |
|
|
|
|
#+NAME: var2-code
|
|
#+begin_src R :results value :exports results :colnames yes :hline
|
|
# Variance, standard deviation and coefficient of variation of salary,
# computed separately for each sex. This operates on `workers` (the table
# built earlier in this file); the block previously referred to `students`,
# which is not defined anywhere in the visible document.
# Note that var() divides by n - 1 (sample variance).
workers[, .(
  var_salary = round(var(salary), 1),
  sd_salary = round(sqrt(var(salary)), 1),
  cov_salary = round(sqrt(var(salary)) / mean(salary), 2)
), by = .(sex)]
|
|
#+end_src
|
|
|
|
#+RESULTS: var2-code
|
|
| sex | var_salary | sd_salary | cov_salary |
|
|
|-----+-------------+-----------+------------|
|
|
| M | 89680952381 | 299467.8 | 1.74 |
|
|
| F | 54500000 | 7382.4 | 0.16 |
|
|
|
|
|
|
** Graphical Displays of Quantitative Information: Common Pitfalls
|
|
|
|
*** Common uses of statistical graphics :slide:
|
|
+ To show trends over time
|
|
+ To show mid-point variations across categories
|
|
+ To show composition
|
|
+ (less commonly, though more usefully) to show/analyse dispersion
|
|
|
|
*** Mis-representation :slide:
|
|
|
|
#+CAPTION: "and sometimes the fact that numbers have a magnitude as well as an order is simply forgotten"
|
|
[[file:graphics/tufte-insanity.png]]
|
|
|
|
*** Mis-representation :slide:
|
|
|
|
#+CAPTION: Another example borrowed from Tufte
|
|
[[file:graphics/tufte-fuel.png]]
|
|
|
|
*** Mis-representation :slide:
|
|
|
|
#+CAPTION: Tufte's graph on fuel economy of cars
|
|
#+attr_html: :width 400px
|
|
[[file:graphics/tufte-fuel2.png]]
|
|
|
|
*** Mis-representation :slide:
|
|
|
|
#+CAPTION: Nobel prizes awarded in science (National Science Foundation, 1974)
|
|
#+attr_html: :width 300px
|
|
[[file:graphics/nobel-wrong.png]]
|
|
|
|
*** Mis-representation :slide:
|
|
|
|
#+CAPTION: Nobel prizes awarded in science (corrected by Tufte)
|
|
#+attr_html: :width 300px
|
|
[[file:graphics/nobel-right.png]]
|
|
|
|
*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide:
|
|
|
|
[[file:graphics/piketty1_o.png]]
|
|
|
|
*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide:
|
|
|
|
[[file:graphics/piketty1_c.png]]
|
|
|
|
*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide:
|
|
|
|
[[file:graphics/piketty2_o.png]]
|
|
|
|
*** Mis-representation: illustrations from Thomas Piketty's work (source Noah Wright) :slide:
|
|
|
|
[[file:graphics/piketty2_c.png]]
|
|
|
|
*** The problem multiplied with the advent of spreadsheets :slide:
|
|
|
|
#+ATTR_html: :width 300px
|
|
[[file:graphics/chart1.png]]
|
|
|
|
#+ATTR_html: :width 300px
|
|
[[file:graphics/chart2.png]]
|
|
|
|
#+ATTR_html: :width 300px
|
|
[[file:graphics/chart3.png]]
|
|
|
|
** Graphical Displays of Quantitative Information: Dispersion :slide:
|
|
*** Histogram :slide:
|
|
|
|
#+RESULTS: ccpc-wheat-hist1
|
|
#+attr_html: :width 800px
|
|
[[file:productionhist1.png]]
|
|
|
|
#+NAME: ccpc-wheat-hist1
|
|
#+BEGIN_SRC R :results output graphics :exports results :file productionhist1.png :width 400 :height 300 :type cairo :family Garamond
|
|
# Histogram (counts) of 2009 yields for crop code 20.
b <- subset(ccpc, Year_Agriculture == 2009 & Crop_code %in% 20)

# Main product (quintals) times 100, divided by crop area (hectares);
# presumably kg per hectare -- confirm units against the data code book.
b$yield <- with(b, Main_Product_Qtls * 100 / Crop_Area_Ha)

hist(b$yield, main = "Histogram of wheat yields", ylim = c(0, 4000))
|
|
#+END_SRC
|
|
|
|
*** Histogram with relative densities :slide:
|
|
|
|
#+RESULTS: ccpc-wheat-hist2
|
|
#+attr_html: :width 600px
|
|
[[file:productionhist2.png]]
|
|
|
|
#+NAME: ccpc-wheat-hist2
|
|
#+BEGIN_SRC R :results output graphics :exports results :file productionhist2.png :width 400 :height 300 :type cairo :family Garamond
|
|
# Histogram of 2009 yields for crop code 20, drawn with densities on the
# vertical axis instead of counts.
b <- subset(ccpc, Year_Agriculture == 2009)
b <- subset(b, Crop_code %in% 20)

# Main product (quintals) times 100, divided by crop area (hectares);
# presumably kg per hectare -- confirm units against the data code book.
b$yield <- b$Main_Product_Qtls * 100 / b$Crop_Area_Ha

# freq = FALSE is written out in full: `F` is an ordinary variable that can
# be reassigned, whereas FALSE is a reserved word.
hist(
  b$yield,
  freq = FALSE,
  main = "Histogram of wheat yields",
  ylim = c(0, 0.00040)
)
|
|
#+END_SRC
|
|
|
|
*** Boxplot :slide:
|
|
+ Invented by John Tukey in 1970
|
|
+ Many variations proposed since then, though the essential form and idea has remained intact.
|
|
|
|
|
|
*** Boxplot of wheat yields :slide:
|
|
|
|
#+RESULTS: ccpc-wheat-box1
|
|
[[file:boxplotyield1.png]]
|
|
|
|
#+NAME: ccpc-wheat-box1
|
|
#+BEGIN_SRC R :results output graphics :exports results :file boxplotyield1.png :width 400 :height 300 :type cairo :family Garamond
|
|
# Boxplot of 2009 yields for crop code 20.
b <- subset(ccpc, Year_Agriculture == 2009 & Crop_code %in% 20)

# Main product (quintals) times 100, divided by crop area (hectares);
# presumably kg per hectare -- confirm units against the data code book.
b$yield <- with(b, Main_Product_Qtls * 100 / Crop_Area_Ha)

boxplot(b$yield, main = "Boxplot of wheat yields")
|
|
#+END_SRC
|
|
|
|
*** Violin plots :slide:
|
|
|
|
#+RESULTS: ccpc-wheat-vio1
|
|
[[file:vioplotyield1.png]]
|
|
|
|
#+NAME: ccpc-wheat-vio1
|
|
#+BEGIN_SRC R :results output graphics :exports results :file vioplotyield1.png :width 400 :height 300 :type cairo :family Garamond
|
|
# Violin plot of 2009 yields for crop code 20.
library(vioplot)

b <- subset(ccpc, Year_Agriculture == 2009 & Crop_code %in% 20)

# Main product (quintals) times 100, divided by crop area (hectares);
# presumably kg per hectare -- confirm units against the data code book.
b$yield <- with(b, Main_Product_Qtls * 100 / Crop_Area_Ha)

vioplot(b$yield)
|
|
#+END_SRC
|
|
|
|
|
|
|
|
|
|
*** Boxplots: Useful to identify extreme values :slide:
|
|
|
|
|
|
#+RESULTS: ccpc-wheat-box2
|
|
[[file:boxplotyield2.png]]
|
|
#+NAME: ccpc-wheat-box2
|
|
#+BEGIN_SRC R :results output graphics :exports results :file boxplotyield2.png :width 400 :height 300 :type cairo :family Garamond
|
|
# Same boxplot of 2009 yields for crop code 20, with the vertical axis
# restricted to 7000-25000 so that the extreme values are magnified.
b <- subset(ccpc, Year_Agriculture == 2009 & Crop_code %in% 20)

# Main product (quintals) times 100, divided by crop area (hectares);
# presumably kg per hectare -- confirm units against the data code book.
b$yield <- with(b, Main_Product_Qtls * 100 / Crop_Area_Ha)

boxplot(
  b$yield,
  main = "Magnified tail of the boxplot",
  ylim = c(7000, 25000)
)
|
|
#+END_SRC
|
|
|
|
*** Boxplots: Useful for comparisons across categories :slide:
|
|
|
|
#+RESULTS: ccpc-crop-box3
|
|
[[file:boxplotyield3.png]]
|
|
#+NAME: ccpc-crop-box3
|
|
#+BEGIN_SRC R :results output graphics :exports results :file boxplotyield3.png :width 400 :height 280 :type cairo :family Garamond
|
|
# Side-by-side boxplots of 2009 yields for nine crops.
b <- subset(ccpc, Year_Agriculture == 2009)
b <- subset(b, Crop_code %in% c(10, 20, 40, 140, 150, 450, 510, 680, 900))

# Map each crop code to its label explicitly. Assigning levels() by position
# depends on how factor() happens to sort the codes (numeric vs. character)
# and on every code being present in the subset; pairing levels with labels
# removes both dependencies.
# NOTE(review): the single-crop slides in this file select Crop_code 20 and
# title the plots "wheat", whereas this mapping (kept from the original
# order) labels 10 as Wheat and 20 as Paddy -- confirm against the code book.
b$Crop_code <- factor(
  b$Crop_code,
  levels = c(10, 20, 40, 140, 150, 450, 510, 680, 900),
  labels = c(
    "Wheat", "Paddy", "Maize", "Bajra", "Ragi",
    "Gram", "Red gram", "Groundnut", "Mustard"
  )
)

# Main product (quintals) times 100, divided by crop area (hectares);
# presumably kg per hectare -- confirm units against the data code book.
b$yield <- b$Main_Product_Qtls * 100 / b$Crop_Area_Ha

# outline = FALSE hides outlier points; FALSE is spelled out because `F` is
# an ordinary, reassignable variable.
boxplot(
  yield ~ Crop_code,
  data = b,
  main = "Boxplots of yields of various crops",
  las = 3,
  ylim = c(0, 8000),
  outline = FALSE
)
|
|
#+END_SRC
|
|
|
|
*** Violin plots :slide:
|
|
|
|
#+RESULTS: ccpc-crop-vio
|
|
[[file:vioplotyield3.png]]
|
|
|
|
#+NAME: ccpc-crop-vio
|
|
#+BEGIN_SRC R :results output graphics :exports results :file vioplotyield3.png :width 400 :height 280 :type cairo :family Garamond
|
|
# Violin plots of 2009 yields for three of the crops, side by side.
# vioplot is attached here so the block does not depend on an earlier block
# having loaded it into the session.
library(vioplot)

b <- subset(ccpc, Year_Agriculture == 2009)
b <- subset(b, Crop_code %in% c(10, 20, 40, 140, 150, 450, 510, 680, 900))

# Map each crop code to its label explicitly. Assigning levels() by position
# depends on how factor() happens to sort the codes (numeric vs. character)
# and on every code being present in the subset; pairing levels with labels
# removes both dependencies.
# NOTE(review): the single-crop slides in this file select Crop_code 20 and
# title the plots "wheat", whereas this mapping (kept from the original
# order) labels 10 as Wheat and 20 as Paddy -- confirm against the code book.
b$Crop_code <- factor(
  b$Crop_code,
  levels = c(10, 20, 40, 140, 150, 450, 510, 680, 900),
  labels = c(
    "Wheat", "Paddy", "Maize", "Bajra", "Ragi",
    "Gram", "Red gram", "Groundnut", "Mustard"
  )
)

# Main product (quintals) times 100, divided by crop area (hectares);
# presumably kg per hectare -- confirm units against the data code book.
b$yield <- b$Main_Product_Qtls * 100 / b$Crop_Area_Ha

# One violin per crop; names= labels the axis, which would otherwise show
# only 1, 2, 3.
vioplot(
  b$yield[b$Crop_code == "Wheat"],
  b$yield[b$Crop_code == "Paddy"],
  b$yield[b$Crop_code == "Maize"],
  names = c("Wheat", "Paddy", "Maize")
)
|
|
#+END_SRC
|
|
|
|
|
|
|
|
|
|
|