# Code Snippets
# --- Strings: creation, length, case conversion, substrings ---
str <- "intro to statistics"
str
str <- "intro to statistics"
nchar(str)
toupper(str)
substr(str,3,12)
# paste() joins its arguments with a space by default; sep="" removes it
x <- 6
y <- exp(1)
paste("The value of x is", x, "and the value of y is", round(y,4))
paste("The value of x is", x, "and the value of y is", round(y,4), sep="")
str2 <- paste(str,str)
str2
# grepl(pattern, x) is TRUE when pattern occurs anywhere inside x
answers <- "ACBBAD"
grepl("AA", answers)
grepl("BB", answers)
grepl("AAA", answers)
# TRUE if any answer letter appears three times in a row
grepl("AAA", answers) | grepl("BBB", answers) | grepl("CCC", answers) | grepl("DDD", answers)
# --- Vectors: creation with c(), appending, length ---
numvec <- c(8,12,5,10,3)
numvec
numvec <- c(numvec,18)
numvec
length(numvec)
answers <- c("A","C","B","B","A","D")
answers
tfvec <- c(TRUE,FALSE,FALSE,TRUE)
tfvec
length(tfvec)
# mixing numbers and strings coerces every element to character
c(8,12,"A")
# --- Sequences and repetition ---
seq(1,10)
seq(1,10,2)
seq(5,4,-0.1)
1:6
1.2:6
6:1
vec <- c(1:6,seq(10,20,2))
vec
rep(0,5)
rep(1,5)
rep(FALSE,8)
rep("abcd",6)
# --- Indexing vectors (1-based; ranges and index vectors allowed) ---
numvec <- c(8,12,5,10,3,18,7,10,2,8)
length(numvec)
numvec[3]
numvec[2:5]
# last three elements
numvec[(length(numvec)-2):length(numvec)]
numvec[c(4,1,9)]
sqrt(numvec)[2:4]
# --- Summary functions on numeric vectors ---
numvec <- c(8,12,5,10,3,18,7,10,2,8)
min(numvec)
max(numvec)
sort(numvec)
sort(numvec, decreasing = TRUE)
unique(numvec)
sort(unique(numvec))
sum(numvec)
mean(numvec)
cumsum(numvec)
# --- Vectorized arithmetic and comparisons ---
numvec <- c(8,12,5,10,3,18,7,10,2,8)
numvec + 1
numvec + 1:10
sqrt(numvec)
numvec >= 9
# elementwise product: quantity times price, summed for total revenue
prices <- c(1.24,3.12,0.78,2.22,4.57,2.89,4.08,1.83,3.78,2.66)
numvec*prices
revenue <- sum(numvec*prices)
revenue
# --- Vectorized string and logical operations ---
strvec <- c("ab","cd","ef")
toupper(strvec)
paste(strvec,".",sep="")
strvec=="ab"
logicvec1 <- c(TRUE,FALSE,FALSE,TRUE)
logicvec2 <- c(TRUE,TRUE,FALSE,TRUE)
logicvec1|logicvec2
logicvec1&logicvec2
# --- all/any, ifelse, which, and counting with logical vectors ---
numvec <- c(8,12,5,10,3,18,7,10,2,8)
prices <- c(1.24,3.12,0.78,2.22,4.57,2.89,4.08,1.83,3.78,2.66)
all(numvec >= 9)
any(numvec >= 9)
ifelse(prices > 3.50, "high price", "low price")
ifelse(prices > 3.50, 3.50, prices)
which(prices > 3.50)
high_prices <- prices[which(prices > 3.50)]
high_prices
# sum() of a logical vector counts the TRUEs; mean() gives the proportion
sum(numvec >= 9)
sum(prices > 3.50)
mean(numvec >= 9)
mean(prices > 3.50)
# --- Simulation: fraction of uniform draws falling in an interval ---
set.seed(1234)
x <- runif(1000)
x[1:10]
sum(x<0.3)
mean(x<0.3)
# multiplying 0/1 indicator vectors acts like a logical AND
mean((x>0.6)*(x<0.8))
x <- 5
x
x/3
# --- Conditionals ---
# if (logical condition) {
# ... commands ...
# }
str <- "intro to statistics"
if (nchar(str) > 10) {
print(paste(str,"is a long string."))
print(paste(str,"has",nchar(str),"characters."))
}
# if (logical condition) {
# ... commands executed if condition is TRUE...
# } else {
# ... commands executed if condition is FALSE...
# }
price <- 25.78
if (price > 20) {
print("The price is too high.")
# apply a 10% discount
price <- 0.9*price
} else {
print("The price is not too high.")
}
# if / else if / else chain: the first TRUE condition wins
price <- 25.78
if (price > 20) {
print("The price is too high.")
price <- 0.9*price
} else if (price < 10) {
print("The price is too low.")
price <- 1.1*price
} else {
print("The price is not too high or too low.")
}
# --- for loops ---
# for (var in sequence) {
# ... commands ...
# }
# Build the first `sequence_length` Fibonacci numbers iteratively.
sequence_length <- 10
fib_sequence <- numeric(sequence_length)
fib_sequence[1:2] <- c(1, 1)
for (k in seq(3, sequence_length)) {
  fib_sequence[k] <- fib_sequence[k - 1] + fib_sequence[k - 2]
}
print(fib_sequence)
# Concatenate the first letter of each animal name into one string.
strvec <- c("cow","horse","pig","chicken","goat")
first_letters <- ""
for (str in strvec) {
  initial <- substr(str, 1, 1)
  first_letters <- paste(first_letters, initial, sep = "")
}
print(first_letters)
# while (logical condition) {
# ... commands ...
# }
# Count how many days of sales it takes to exceed 200 total units.
sales <- c(38,52,24,61,47,18,29,44,41)
total_sales <- 0
idx <- 0
repeat {
  idx <- idx + 1
  total_sales <- total_sales + sales[idx]
  if (total_sales > 200) break
}
print(paste("It took",idx,"days to reach over 200 sales. There were",total_sales,"sales after",idx,"days."))
# Area of a triangle given its base and height.
# Vectorized: base and height may be equal-length vectors,
# in which case elementwise areas are returned.
triangle_area <- function(base, height) {
  0.5 * base * height
}
triangle_area(10,5)
# NOTE: `area` exists only inside the function body; this line errors
# ("object 'area' not found") -- it illustrates local (function) scope
area
# assigning `height` globally does not affect the function's argument
height <- 8
triangle_area(10,5)
height
# the function is vectorized: elementwise areas for three triangles
base_vec <- c(10,15,20)
height_vec <- c(5,8,10)
triangle_area(base_vec,height_vec)
# Euclidean distance between points (x1, y1) and (x2, y2).
# The second point defaults to the origin (0, 0).
distance <- function(x1, y1, x2 = 0, y2 = 0) {
  dx <- x1 - x2
  dy <- y1 - y2
  sqrt(dx^2 + dy^2)
}
distance(3,4)
distance(3,4,5,-1)
# --- Data frames ---
df <- data.frame(name = c("Amy", "Blake", "Chloe"), age = c(28, 41, 32), employed = c("Yes", "No", "Yes"))
print(df)
mean(df$age)
# show the class of each column
sapply(df,class)
df$employed <- factor(df$employed)
sapply(df,class)
# column access with $, and [row, column] indexing
df$name
df$age[1]
df[1,2]
df[,2]
df[,c("name","employed")]
df[1:2,]
# logical row filter
df[df$age>30,]
# --- Reading and writing CSV files ---
setwd("/Users/ja8294/Library/CloudStorage/Box-Box/ECO 329 book/data")
exams <- read.csv("exams.csv")
exams[5:9,]
print(paste("Top score on exam #1:",max(exams$exam1)))
print(paste("Top score on exam #2:",max(exams$exam2)))
head(exams)
str(exams)
summary(exams)
names(exams)
# add a new column holding each student's two-exam average
exams$avg <- (exams$exam1+exams$exam2)/2
str(exams)
write.csv(exams,file="exams-edited.csv")
# --- Missing values (NA) ---
hourly_wage <- c(12.50, NA, 10.75, 11.00, NA, NA, 14.80, 13.25)
age <- c(24, 42, 31, 61, 55, 26, 34, 59)
is.na(hourly_wage)
hourly_wage[!is.na(hourly_wage)]
sum(is.na(hourly_wage))
worker_df <- data.frame(wagehr = hourly_wage, age = age)
# drop all rows containing any NA
na.omit(worker_df)
# mean() returns NA unless NAs are removed explicitly
mean(hourly_wage)
mean(hourly_wage, na.rm = TRUE)
max(hourly_wage, na.rm = TRUE)
# --- Installing and loading a package ---
# Point the CRAN mirror at a US mirror before installing.
# Use https: CRAN mirrors require secure downloads (plain http is
# rejected/redirected by modern mirrors).
r <- getOption("repos")
r["CRAN"] <- "https://cran.us.r-project.org"
options(repos = r)
install.packages("stringr")
library(stringr)
library(help = stringr)
str_trim(" testing this out ")
# --- Chapter exercises: tracing assignments ---
x <- 16
sqrt(x)
x <- 16
x <- sqrt(x)
x <- c(x,x)
x <- c(79,16,53,44)
x <- sort(x, decreasing = TRUE)
length(x)
y <- c(6.7,-3.3,4.2)
x <- (1:3)*y
min(x)
x <- (x > 0)
sum(x)
# data for exercises: daily sales and an up/down return string
sales <- c(38,52,24,61,47,18,29,44,41)
returns <- "UDUUUDDUDUDUUUUDUDUU"
setwd("/Users/ja8294/Library/CloudStorage/Box-Box/ECO 329 book/data")
exams <- read.csv("exams.csv")
cigdata <- read.csv("cigdata2019.csv")
# --- Coin-toss simulation: law of large numbers ---
# sample() with replace=TRUE simulates independent draws
sample(c("H","T"), 10, replace = TRUE)
set.seed(1234)
cointosses <- sample(c("H","T"), 10000, replace = TRUE)
mean(cointosses=="H")
set.seed(1234)
# toss a coin 10,000 times
cointosses <- sample(c("H","T"), 10000, replace = TRUE)
# show the results of the first 10 tosses
cointosses[1:10]
# calculate the cumulative sum of heads
# (e.g. cumul_heads[100] is number of heads through first 100 tosses)
cumul_heads <- cumsum(cointosses=="H")
# calculate the cumulative frequency of heads
# (e.g. freqheads[100] is the frequency (fraction) of heads through first 100 tosses)
freq_heads <- cumul_heads/(1:10000)
# plot the freqheads vector
plot(freq_heads,xlab="Simulation number",ylab="Cumulative frequency of heads",cex=0.5)
abline(h=0.5,col="blue",lty=3)
set.seed(1234)
# toss two coins 10,000 times
coin1tosses <- sample(c("H","T"), 10000, replace=TRUE)
coin2tosses <- sample(c("H","T"), 10000, replace=TRUE)
# calculate the cumulative number of "two heads" occurrences
cumul_twoheads <- cumsum(coin1tosses=="H" & coin2tosses=="H")
# calculate the cumulative frequency of "two heads" occurrences
freq_twoheads <- cumul_twoheads/(1:10000)
# plot the freq_twoheads vector
plot(freq_twoheads,xlab="Simulation number",ylab="Cumulative frequency of two-heads",cex=0.5)
abline(h=0.25,col="blue",lty=3)
set.seed(1234)
coin1tosses <- sample(c("H","T"), 10000, replace=TRUE)
coin2tosses <- sample(c("H","T"), 10000, replace=TRUE)
mean(coin1tosses=="H" & coin2tosses=="H")
set.seed(1234)
# roll six-sided die 10,000 times
dierolls <- sample(1:6, 10000, replace = TRUE)
# show the results of the first 10 rolls
dierolls[1:10]
# calculate the cumulative number of "six" rolls
cumul_six <- cumsum(dierolls==6)
# calculate the cumulative frequency of "six" rolls
freq_six <- cumul_six/(1:10000)
# plot the freq_six vector
plot(freq_six,xlab="Simulation number",ylab="Cumulative frequency of sixes",cex=0.5)
abline(h=1/6,col="blue",lty=3)
set.seed(1234)
dierolls <- sample(1:6, 10000, replace = TRUE)
mean(dierolls==6)
# --- Simulation: probability of a five-head streak in 100 tosses ---
# disable scientific notation in printed output
options(scipen=999)
set.seed(1234)
# initialize the number of simulations and number of tosses
num_simulations <- 100000
num_tosses <- 100
# initialize a vector to store the cumulative count of streaks
cumul_streaks <- rep(0, num_simulations)
# initialize a counter for the number of streaks observed
streak_counter <- 0
# loop with simulated coin tosses and check for streak
for (i in 1:num_simulations) {
# simulate the coin tosses
tosses <- sample(c("H","T"), num_tosses, replace = TRUE)
# paste together the toss outcomes into a single string
toss_string <- paste(tosses, collapse = "")
# add one to streak counter if streak of five heads occurs
# (grepl returns TRUE/FALSE, which adds as 1/0)
streak_counter <- streak_counter + grepl("HHHHH", toss_string)
cumul_streaks[i] <- streak_counter
}
# create a vector of the cumulative frequencies
freq_streaks <- cumul_streaks/(1:num_simulations)
# output the final frequency (approximated probability)
freq_streaks[num_simulations]
# plot the cumulative frequencies against the simulation number
# (the first 999 simulations are skipped so the y-axis can be zoomed in)
plot(freq_streaks[1000:num_simulations], xlab="Simulation number",
ylab="Cumulative frequency of five-head streak occurrence", cex=0.5, ylim=c(0.77,0.83))
# --- Categorical variables: frequency tables and bar charts ---
# NOTE(review): `cps` is loaded later in this transcript (cps2019.csv);
# these snippets assume it is already in the workspace.
table(cps$lfstatus)
table(cps$lfstatus)/nrow(cps)
# graph-display format (two rows, one column)
par(mfrow = c(2,1))
# bar chart with sample counts
barplot(table(cps$lfstatus), ylim=c(0,3000), main="Bar chart (counts) of labor-force status")
# bar chart with sample proportions
barplot(table(cps$lfstatus)/nrow(cps), ylim=c(0,0.8), main="Bar chart (proportions) of labor-force status")
# graph-display format (two rows, two columns)
par(mfrow = c(2,2))
# histogram of age with bin width = one year, counts
hist(cps$age, breaks=seq(29.5,59.5,1), main="Bin width = 1 yr, counts", xlab="Age")
# histogram of age with bin width = one year, density
hist(cps$age, breaks=seq(29.5,59.5,1), freq=FALSE, main="Bin width = 1 yr, density", xlab="Age")
# histogram of age with bin width = two years, counts
hist(cps$age, breaks=seq(29.5,59.5,2), main="Bin width = 2 yrs, counts", xlab="Age")
# histogram of age with bin width = two years, density
hist(cps$age, breaks=seq(29.5,59.5,2), freq=FALSE, main="Bin width = 2 yrs, density", xlab="Age")
# create subsample of the data of only "Employed" individuals
cpsemployed <- cps[cps$lfstatus=="Employed",]
# show the number of employed individuals
nrow(cpsemployed)
# graph-display format (three rows, two columns)
par(mfrow = c(3,2))
# histogram of weekly earnings with 10 bins (density on y-axis)
hist(cpsemployed$earnwk, breaks=10, freq=FALSE, main="Weekly earnings, 10 bins", xlab="Weekly earnings")
# histogram of weekly earnings with 20 bins (density on y-axis)
hist(cpsemployed$earnwk, breaks=20, freq=FALSE, main="Weekly earnings, 20 bins", xlab="Weekly earnings")
# histogram of weekly earnings with 50 bins (density on y-axis)
hist(cpsemployed$earnwk, breaks=50, freq=FALSE, main="Weekly earnings, 50 bins", xlab="Weekly earnings")
# histogram of weekly earnings with 100 bins (density on y-axis)
hist(cpsemployed$earnwk, breaks=100, freq=FALSE, main="Weekly earnings, 100 bins", xlab="Weekly earnings")
# histogram of weekly earnings with 200 bins (density on y-axis)
hist(cpsemployed$earnwk, breaks=200, freq=FALSE, main="Weekly earnings, 200 bins", xlab="Weekly earnings")
# histogram of weekly earnings with 500 bins (density on y-axis)
hist(cpsemployed$earnwk, breaks=500, freq=FALSE, main="Weekly earnings, 500 bins", xlab="Weekly earnings")
# histogram of weekly earnings with 92 bins, with estimated density overlaid
hist(cpsemployed$earnwk, breaks=92, freq=FALSE, main="", xlab="Weekly earnings")
lines(density(cpsemployed$earnwk), col="blue", lwd=2)
# --- Measures of center: mean vs median ---
mean(cps$age)
median(cps$age)
mean(cps$earnwk, na.rm = TRUE)
median(cps$earnwk, na.rm = TRUE)
# histogram of weekly earnings with 92 bins (Freedman-Diaconis), with estimated density overlaid
hist(cpsemployed$earnwk, breaks=92, freq=FALSE, main="", xlab="Weekly earnings")
lines(density(cpsemployed$earnwk), col="blue", lwd=2)
abline(v=mean(cpsemployed$earnwk), col="red", lwd=2)
abline(v=median(cpsemployed$earnwk), col="green", lwd=2)
legend("topright", legend=c("Sample median","Sample mean"), col=c("green","red"), lty=c(1,1), lwd=c(2,2))
# --- Stock returns: AT&T (T) vs Bank of America (BAC) ---
# NOTE(review): `sp500` (monthly returns data) is assumed loaded
mean(sp500$T)
median(sp500$T)
mean(sp500$BAC)
median(sp500$BAC)
# graph-display format (two rows, one column)
par(mfrow = c(2,1))
# histograms of T and BAC returns, with density curves
hist(sp500$T, freq=FALSE, breaks=seq(-0.8,0.8,0.02), main="", xlab="Monthly return (T)")
lines(density(sp500$T), col="blue", lwd=2)
hist(sp500$BAC, freq=FALSE, breaks=seq(-0.8,0.8,0.02), main="", xlab="Monthly return (BAC)")
lines(density(sp500$BAC), col="blue", lwd=2)
# sample quartiles of weekly earnings
quantile(cpsemployed$earnwk, probs = c(0.25,0.50,0.75))
# sample quartiles of weekly earnings
quantile(cpsemployed$earnwk, probs = seq(0.1,0.9,0.1))
# histogram of weekly earnings with 92 bins (Freedman-Diaconis), with estimated density overlaid
hist(cpsemployed$earnwk, breaks=92, freq=FALSE, main="Weekly earnings distribution is right-skewed", xlab="Weekly earnings")
lines(density(cpsemployed$earnwk), col="blue", lwd=2)
abline(v=quantile(cpsemployed$earnwk,0.1), col="red", lwd=2)
abline(v=quantile(cpsemployed$earnwk,0.25), col="orange", lwd=2)
abline(v=median(cpsemployed$earnwk), col="blue", lwd=2)
abline(v=quantile(cpsemployed$earnwk,0.75), col="green", lwd=2)
abline(v=quantile(cpsemployed$earnwk,0.9), col="violet", lwd=2)
legend("topright", legend=c("Sample 10% quantile","Sample 25% quantile","Sample median","Sample 75% quantile","Sample 90% quantile"), col=c("red","orange","blue","green","violet"), lty=c(1,1),lwd=c(2,2))
# --- Normal pdfs: effect of changing mean and standard deviation ---
par(mfrow = c(2,2))
s <- seq(-4,+4,0.01)
plot(s, dnorm(s,0, 1), type="l", col="blue", xlab="",ylab="")
lines(s, dnorm(s,0,1.5), col="black", lty=3)
s <- seq(-4,+8,0.01)
plot(s, dnorm(s,4, 1), type="l", col="blue", xlab="",ylab="")
lines(s, dnorm(s,0,1), col="black", lty=3)
s <- seq(-6,+8,0.01)
plot(s, dnorm(s,4, 1), type="l", col="blue", xlab="",ylab="")
lines(s, dnorm(s,0,2), col="black", lty=3)
s <- seq(-4,+10,0.01)
plot(s, dnorm(s,0,1), type="l", col="blue", xlab="",ylab="")
lines(s, dnorm(s,4,2), col="black", lty=3)
# --- Measures of spread ---
# interquartile range of weekly earnings
IQR(cps$earnwk, na.rm = TRUE)
# interquartile range of age
IQR(cps$age)
# graph-display format (one row, two columns)
par(mfrow = c(1,2))
# box plot with whiskers and outliers (the default in R)
boxplot(cpsemployed$earnwk, ylab="Weekly earnings", main="Box plot (whiskers and outliers)")
# box plot with whiskers at min and max values
boxplot(cpsemployed$earnwk, range=0, ylab="Weekly earnings", main="Box plot (whiskers at min/max)")
# graph-display format (one row, two columns)
par(mfrow = c(1,2))
# box plot (whiskers and outliers) for AT&T
boxplot(sp500$T, ylab="Monthly return (T)", main="Box plot (whiskers and outliers)",ylim=c(-0.6,0.8))
# box plot (whiskers and outliers) for Bank of America
boxplot(sp500$BAC, ylab="Monthly return (BAC)", main="Box plot (whiskers and outliers)",ylim=c(-0.6,0.8))
# MAD for AT&T
mean(abs(sp500$T-mean(sp500$T)))
# MAD for Bank of America
mean(abs(sp500$BAC-mean(sp500$BAC)))
# sample variance and sample standard deviation for AT&T
var(sp500$T)
sd(sp500$T)
# sample variance and sample standard deviation for Bank of America
var(sp500$BAC)
sd(sp500$BAC)
# --- Comparing distributions across subsamples ---
# split employed workers by union status
cpsunion <- cpsemployed[cpsemployed$unionstatus=="Union",]
cpsnonunion <- cpsemployed[cpsemployed$unionstatus=="Non-union",]
par(mfrow = c(2,1))
hist(cpsunion$earnwk, breaks=seq(0,max(cpsunion$earnwk)+200,200), freq=FALSE, main="Weekly earnings (union)",xlim=c(0,6000),xlab="Weekly earnings")
lines(density(cpsunion$earnwk), col="blue", lwd=2)
hist(cpsnonunion$earnwk, breaks=seq(0,max(cpsnonunion$earnwk)+200,200), freq=FALSE, main="Weekly earnings (non-union)", xlim=c(0,6000),xlab="Weekly earnings")
lines(density(cpsnonunion$earnwk), col="blue", lwd=2)
# split employed workers by gender
cpsmale <- cpsemployed[cpsemployed$gender=="Male",]
cpsfemale <- cpsemployed[cpsemployed$gender=="Female",]
par(mfrow = c(2,1))
hist(cpsmale$earnwk, breaks=seq(0,max(cpsmale$earnwk)+200,200), freq=FALSE, main="Weekly earnings (male)",xlim=c(0,6000),xlab="Weekly earnings")
lines(density(cpsmale$earnwk), col="blue", lwd=2)
hist(cpsfemale$earnwk, breaks=seq(0,max(cpsfemale$earnwk)+200,200), freq=FALSE, main="Weekly earnings (female)", xlim=c(0,6000),xlab="Weekly earnings")
lines(density(cpsfemale$earnwk), col="blue", lwd=2)
# graph-display format (one row, three columns)
par(mfrow = c(1,3))
# histogram with bin widths of one hour
hist(cpsemployed$hrslastwk, breaks=seq(0.5,99.5,1), freq=FALSE, main="1-hour bins, hrslastwk", xlab="Hours worked last week")
# histogram with bin widths of ten hours
hist(cpsemployed$hrslastwk, breaks=seq(-5,105,10), freq=FALSE, main="10-hour bins, hrslastwk", xlab="Hours worked last week")
# box plot with whiskers and outliers
boxplot(cpsemployed$hrslastwk, ylab="Hours worked last week")
# --- Linear transformations y = a + b*x of a normal variable ---
par(mfrow = c(2,2))
s <- seq(-2,+10,0.01)
plot(s, dnorm(s,2, 1), type="l", col="blue", xlab="",ylab="",ylim=c(0,1),main=expression(paste(a==0, ", ", b==2)))
lines(s, dnorm(s,4,2), col="red")
legend("topright", legend=c("x","y = 2x"), col=c("blue","red"), lty=c(1,1), inset=0.02)
s <- seq(-2,+10,0.01)
plot(s, dnorm(s,2,1), type="l", col="blue", xlab="",ylab="",ylim=c(0,1),main=expression(paste(a==0, ", ", b==0.5)))
lines(s, dnorm(s,1,0.5), col="red")
legend("topright", legend=c("x","y = 0.5x"), col=c("blue","red"), lty=c(1,1), inset=0.02)
s <- seq(-2,+10,0.01)
plot(s, dnorm(s,2,1), type="l", col="blue", xlab="",ylab="",ylim=c(0,1),main=expression(paste(a==3, ", ", b==1)))
lines(s, dnorm(s,5,1), col="red")
legend("topright", legend=c("x","y = 3+x"), col=c("blue","red"), lty=c(1,1), inset=0.02)
s <- seq(-2,+10,0.01)
plot(s, dnorm(s,2,1), type="l", col="blue", xlab="",ylab="",ylim=c(0,1),main=expression(paste(a==3, ", ", b==0.5)))
lines(s, dnorm(s,4,0.5), col="red")
legend("topright", legend=c("x","y = 3+0.5x"), col=c("blue","red"), lty=c(1,1), inset=0.02)
# --- Loading the CPS data used throughout ---
# na.strings="" treats empty cells as NA; factors for categorical columns
cps <- read.csv("cps2019.csv", na.strings="", stringsAsFactors=TRUE)
cpsemployed <- subset(cps,cps$lfstatus=="Employed")
# non-working hours = 168 hours per week minus hours worked
nonworkhrs <- 168-cps$hrslastwk
par(mfrow = c(2,1))
# histogram with bin widths of 1 hour
hist(cpsemployed$hrslastwk, breaks=seq(0.5,99.5,1), freq=FALSE, main="", xlab="Hours worked last week")
# histogram with bin widths of 1 hour
hist(nonworkhrs, breaks=seq(68.5,168.5,1), freq=FALSE, main="", xlab="Non-working hours last week")
# --- Time-series plots ---
plot(sp500$T, type="l", ylab="Monthly return (T)")
# create a time-series object for AT&T monthly returns
ts_T <- ts(sp500$T, start=c(1991,1), frequency=12)
plot(ts_T, ylab="Monthly return (T)")
# create the time-series objects for monthly returns
ts_T <- ts(sp500$T, start=c(1991,1), frequency=12)
ts_BAC <- ts(sp500$BAC, start=c(1991,1), frequency=12)
plot(ts_T, ylab="Monthly returns", ylim=c(-0.6,0.6))
lines(ts_BAC, col="blue")
legend("topright", c("T","BAC"), lty=1, col=c("black","blue"))
plot(cbind(ts_T,ts_BAC), main="")
# --- Joint distributions of two categorical variables ---
# table of joint sample counts for labor-force status and race
table(cps$lfstatus, cps$race)
addmargins(table(cps$lfstatus, cps$race))
# table of joint sample proportions for labor-force status and race
addmargins(table(cps$lfstatus, cps$race))/nrow(cps)
# table of sample proportions of labor-force status conditional on race
prop.table(table(cps$lfstatus, cps$race), margin=2)
# table of sample proportions of race conditional on labor-force status
prop.table(table(cps$lfstatus, cps$race), margin=1)
# create sample count table for race and labor-force status variables
tbl_racelf <- table(cps$race, cps$lfstatus)
# barplot command --- note that categories on x-axis are based upon columns (lfstatus) of the table
barplot(prop.table(tbl_racelf, margin=1), ylim=c(0,0.8), col=c("blue","dodgerblue","lightblue"),
legend.text=rownames(tbl_racelf), beside=TRUE, main="")
# create sample count table table for race and labor-force status variables
tbl_lfrace <- table(cps$lfstatus, cps$race)
# barplot command --- categories on x-axis are based upon columns (race) of the table
barplot(prop.table(tbl_lfrace, margin=1), ylim=c(0,0.8), col=c("blue","dodgerblue","lightblue"),
legend.text=rownames(tbl_lfrace), beside=TRUE,
args.legend = list(x="topleft",inset=0.01), main="")
# box plot of weekly earnings, by race
boxplot(cps$earnwk~cps$race, ylab="Weekly earnings", xlab="Race")
# create weekly earnings vectors for the three subsamples, by race
earnwk_black <- cpsemployed[cpsemployed$race=="Black","earnwk"]
earnwk_other <- cpsemployed[cpsemployed$race=="Other","earnwk"]
earnwk_white <- cpsemployed[cpsemployed$race=="White","earnwk"]
# first plot the density of weekly earnings for black individuals
plot(density(earnwk_black), main="", xlab="Weekly earnings")
# overlay the density of weekly earnings for other-race individuals
lines(density(earnwk_other), lty=2)
# overlay the density of weekly earnings for white individuals
lines(density(earnwk_white), lty=3)
# draw the legend
legend("topright", legend=c("Black","Other","White"), lty=c(1,2,3), inset=0.01)
# --- Scatter plots for two numeric variables ---
# scatter plot of weekly earnings versus years of education
plot(cps$educ, cps$earnwk, xlab="Education", ylab="Weekly earnings")
# expanded scatter plot, with weekly earnings, years of education, and weekly hours worked
plot(cps[,c("educ","earnwk","hrslastwk")])
# scatter plot of Lowe's returns versus Home Depot returns
plot(sp500$HD, sp500$LOW, xlab="Monthly returns (HD)", ylab="Monthly returns (LOW)")
# expanded scatter plot of monthly returns
plot(sp500[,c("HD","LOW","BAC","WFC")])
# small example for hand-computing covariance
tempx <- c(4,3,8,12,0,10,5)
tempy <- c(8,6,10,1,15,3,6)
plot(tempx,tempy,xlab="x",ylab="y")
# --- Sample covariance ---
cov(sp500$HD, sp500$LOW)
# scatter plot with horizontal/vertical lines drawn at the sample means
plot(sp500$HD, sp500$LOW, main="", xlab="Monthly returns (HD)", ylab="Monthly returns (LOW)")
abline(h=mean(sp500$LOW))
abline(v=mean(sp500$HD))
cov(cpsemployed$educ, cpsemployed$earnwk)
# scatter plot with horizontal/vertical lines drawn at the sample means
plot(cpsemployed$educ, cpsemployed$earnwk, main="", xlab="Education", ylab="Weekly earnings")
abline(h=mean(cpsemployed$earnwk))
abline(v=mean(cpsemployed$educ))
# --- Simulated scatter plots for several target correlations ---
# To generate (x, y) with cor(x, y) ~= r and unit variances, use
#   y = r*x + sqrt(1 - r^2)*e,   e ~ N(0,1) independent of x.
# The noise weight must be sqrt(1 - r^2); the previous (1 - r^2)
# weight makes the realized correlation differ from the r shown
# in each panel title.
set.seed(1234)
x <- rnorm(100)
y5 <- rnorm(100)
y1 <- 0.4*x + sqrt(1-(0.4)^2)*rnorm(100)
y2 <- 0.8*x + sqrt(1-(0.8)^2)*rnorm(100)
y3 <- -0.4*x + sqrt(1-(0.4)^2)*rnorm(100)
y4 <- -0.8*x + sqrt(1-(0.8)^2)*rnorm(100)
y6 <- x
par(mfrow=c(2,3))
# x goes on the horizontal axis to match xlab/ylab (the original
# passed the arguments in the reverse order of the axis labels)
plot(x,y1,main=r[xy]~" = +0.4",ylab="y",xlab="x")
plot(x,y2,main=r[xy]~" = +0.8",ylab="y",xlab="x")
plot(x,y6,main=r[xy]~" = +1",ylab="y",xlab="x")
plot(x,y3,main=r[xy]~" = -0.4",ylab="y",xlab="x")
plot(x,y4,main=r[xy]~" = -0.8",ylab="y",xlab="x")
plot(x,y5,main=r[xy]~" = 0",ylab="y",xlab="x")
# --- Zero correlation does not imply no relationship ---
# a parabola-shaped cloud with (near) zero sample correlation
temp <- read.csv("parabola-zero-correlation.csv")
par(mfrow=c(1,2))
plot(temp[,1],temp[,2],xlab="x",ylab="y")
plot(temp[,1],temp[,2],xlab="x",ylab="y")
abline(h=mean(temp[,2]),lty=3)
abline(v=mean(temp[,1]),lty=3)
# --- Sample correlation ---
cor(cpsemployed$educ, cpsemployed$earnwk)
cor(cpsemployed$hrslastwk, cpsemployed$earnwk)
cor(sp500$HD, sp500$LOW)
# correlation matrix for several stocks at once
cor(sp500[,c("HD","LOW","BAC","WFC","MRO","COP")])
# sample mean of weekly earnings for union employees
mean(cpsemployed[cpsemployed$unionstatus=="Union","earnwk"])
# sample mean of weekly earnings for non-union employees
mean(cpsemployed[cpsemployed$unionstatus=="Non-union","earnwk"])
# sample correlation between union indicator and weekly earnings
union_var <- ifelse(cpsemployed$unionstatus=="Union",1,0)
cor(union_var, cpsemployed$earnwk)
# --- Variation of x + y under positive, zero, and negative correlation ---
set.seed(1234)
par(mfrow=c(3,2))
x<-rnorm(200)
# For cor(x, y) = r with unit variances, y = r*x + sqrt(1 - r^2)*e
# with e ~ N(0,1) independent of x.  The noise weight must be
# sqrt(1 - r^2); the previous (1 - r^2) weight inflated the realized
# correlation well above the intended 0.85.
y1 <- 0.85*x + sqrt(1-(0.85)^2)*rnorm(200)
y2 <- -0.85*x + sqrt(1-(0.85)^2)*rnorm(200)
y3 <- rnorm(200)
# positive correlation: points cluster along upward diagonal, so the
# anti-diagonal lines x + y = c (slope -1) spread widely
plot(x,y1,xlim=c(-3,3),ylim=c(-3,3),xlab="x",ylab="y",main="Variation in "~x+y~" for positive"~r[xy])
abline(a=-3,b=-1,lty=3)
abline(a=-2,b=-1,lty=3)
abline(a=-1,b=-1,lty=3)
abline(a=0,b=-1,lty=3)
abline(a=1,b=-1,lty=3)
abline(a=2,b=-1,lty=3)
abline(a=3,b=-1,lty=3)
hist(x+y1,breaks=seq(-7.5,7.5,0.5),freq=FALSE,main="Distribution of "~x+y,ylim=c(0,1),xlab="")
axis(side=1,at=seq(-6,6,1),labels=seq(-6,6,1))
# zero correlation: intermediate spread of x + y
plot(x,y3,xlim=c(-3,3),ylim=c(-3,3),xlab="x",ylab="y",main="Variation in"~x+y~" for "~r[xy]~"= 0")
abline(a=-3,b=-1,lty=3)
abline(a=-2,b=-1,lty=3)
abline(a=-1,b=-1,lty=3)
abline(a=0,b=-1,lty=3)
abline(a=1,b=-1,lty=3)
abline(a=2,b=-1,lty=3)
abline(a=3,b=-1,lty=3)
hist(x+y3,breaks=seq(-7.5,7.5,0.5),freq=FALSE,main="Distribution of "~x+y,ylim=c(0,1),xlab="")
axis(side=1,at=seq(-6,6,1),labels=seq(-6,6,1))
# negative correlation: x + y is tightly concentrated near zero
plot(x,y2,xlim=c(-3,3),ylim=c(-3,3),xlab="x",ylab="y",main="Variation in"~x+y~" for negative"~r[xy])
abline(a=-3,b=-1,lty=3)
abline(a=-2,b=-1,lty=3)
abline(a=-1,b=-1,lty=3)
abline(a=0,b=-1,lty=3)
abline(a=1,b=-1,lty=3)
abline(a=2,b=-1,lty=3)
abline(a=3,b=-1,lty=3)
hist(x+y2,breaks=seq(-7.5,7.5,0.5),freq=FALSE,main="Distribution of "~x+y,ylim=c(0,1),xlab="")
axis(side=1,at=seq(-6,6,1),labels=seq(-6,6,1))
# --- Mean/sd trade-off for two-stock portfolios ---
# graph-display format (two rows, one column)
par(mfrow=c(2,1))
# create vectors of length nine
mean_bac_wfc <- rep(0,9)
sd_bac_wfc <- rep(0,9)
mean_cop_wfc <- rep(0,9)
sd_cop_wfc <- rep(0,9)
# for loop with nine iterations (nine different weights)
# portfolio return = w*stock1 + (1-w)*stock2, with weight w = 0.1*i
for (i in 1:9) {
mean_bac_wfc[i] <- mean(0.1*i*sp500$BAC + (1-0.1*i)*sp500$WFC)
sd_bac_wfc[i] <- sd(0.1*i*sp500$BAC + (1-0.1*i)*sp500$WFC)
mean_cop_wfc[i] <- mean(0.1*i*sp500$COP + (1-0.1*i)*sp500$WFC)
sd_cop_wfc[i] <- sd(0.1*i*sp500$COP + (1-0.1*i)*sp500$WFC)
}
# plot standard deviation vs mean for BAC-WFC portolios
plot(mean_bac_wfc, sd_bac_wfc, ylim=c(0.07,0.11), ylab="Standard deviation",
xlab="Mean", main="BAC-WFC Portfolios for Different Weights")
# add labels showing the weights above the points
for (i in 1:9) { text(mean_bac_wfc[i],sd_bac_wfc[i]+0.005,as.character(0.1*i),cex=0.5) }
# plot standard deviation vs mean for COP-WFC portolios
plot(mean_cop_wfc, sd_cop_wfc, ylim=c(0.06,0.10), ylab="Standard deviation",
xlab="Mean", main="COP-WFC Portfolios for Different Weights")
# add labels showing the weights above the points
for (i in 1:9) { text(mean_cop_wfc[i],sd_cop_wfc[i]+0.005,as.character(0.1*i),cex=0.5) }
# --- Probability mass functions (pmf's), drawn with type="h" spikes ---
par(mfrow=c(2,2))
plot(0:1,c(0.5,0.5),type="h",axes=FALSE,main="pmf for fair coin toss",xlab=expression(paste(x[k],"*")),ylab="",xlim=c(0,1),ylim=c(0,1))
title(ylab=expression(paste("Probability ",p[X],"(",x[k],"*",")")), line=2.5)
axis(1,at=c(0,1))
axis(2,at=seq(0,1,0.1))
plot(1:6,rep(1/6,6),type="h",axes=FALSE,main="pmf for fair die roll",xlab=expression(paste(x[k],"*")),ylab="",xlim=c(1,6),ylim=c(0,1))
title(ylab=expression(paste("Probability ",p[X],"(",x[k],"*",")")), line=2.5)
axis(1,at=1:6)
axis(2,at=seq(0,1,0.1))
plot(0:3,c(0.512,0.384,0.096,0.008),type="h",axes=FALSE,main="pmf for # of purchases",xlab=expression(paste(x[k],"*")),ylab="",xlim=c(0,3),ylim=c(0,1))
title(ylab=expression(paste("Probability ",p[X],"(",x[k],"*",")")), line=2.5)
axis(1,at=0:3)
axis(2,at=seq(0,1,0.1))
# geometric pmf: P(first purchase on visit k) = 0.2 * 0.8^(k-1)
temp <- 1:20
probtemp <- 0.2*(0.8^(temp-1))
plot(1:20,probtemp,type="h",axes=FALSE,main="pmf for # visitors until purchase",xlab=expression(paste(x[k],"*")),ylab="",xlim=c(1,20),ylim=c(0,1))
title(ylab=expression(paste("Probability ",p[X],"(",x[k],"*",")")), line=2.5)
axis(1,at=1:20)
axis(2,at=seq(0,1,0.1))
# --- Cumulative distribution functions (step functions) ---
# open circles (pch=1) mark excluded points, filled (pch=16) included
par(mfrow=c(1,1))
plot(-1:2,c(0,0.5,1,1),type="s",axes=FALSE,lty=3,main="",xlab=expression(paste(x[k],"*")),ylab="",xlim=c(-1,2),ylim=c(0,1))
title(ylab=expression(paste(F[X],"(",x[k],"*",")")), line=2.5)
axis(1,at=c(0,1))
axis(2,at=seq(0,1,0.1))
points(0,0,pch=1)
points(0,0.5,pch=16)
points(1,0.5,pch=1)
points(1,1,pch=16)
segments(-1,0,0,0)
segments(0,0.5,1,0.5)
segments(1,1,2,1)
# cdf for a fair die roll
par(mfrow=c(1,1))
plot(0:7,c(0,1/6,2/6,3/6,4/6,5/6,1,1),type="s",axes=FALSE,main="",xlab=expression(paste(x[k],"*")),ylab="",xlim=c(0,7),ylim=c(0,1),lty=3)
title(ylab=expression(paste(F[X],"(",x[k],"*",")")), line=2.5)
axis(1,at=1:6)
axis(2,at=seq(0,1,0.1))
points(1,0,pch=1)
points(1,1/6,pch=16)
points(2,1/6,pch=1)
points(2,2/6,pch=16)
points(3,2/6,pch=1)
points(3,3/6,pch=16)
points(4,3/6,pch=1)
points(4,4/6,pch=16)
points(5,4/6,pch=1)
points(5,5/6,pch=16)
points(6,5/6,pch=1)
points(6,1,pch=16)
segments(0,0,1,0)
segments(1,1/6,2,1/6)
segments(2,2/6,3,2/6)
segments(3,3/6,4,3/6)
segments(4,4/6,5,4/6)
segments(5,5/6,6,5/6)
segments(6,1,7,1)
# cdf for the number-of-purchases pmf shown earlier
par(mfrow=c(1,1))
plot(-1:4,c(0,0.512,0.896,0.992,1,1),type="s",axes=FALSE,main="",xlab=expression(paste(x[k],"*")),ylab="",xlim=c(-1,4),ylim=c(0,1),lty=3)
title(ylab=expression(paste(F[X],"(",x[k],"*",")")), line=2.5)
axis(1,at=0:3)
axis(2,at=seq(0,1,0.1))
points(0,0,pch=1)
points(0,0.512,pch=16)
points(1,0.512,pch=1)
points(1,0.896,pch=16)
points(2,0.896,pch=1)
points(2,0.992,pch=16)
points(3,0.992,pch=1)
points(3,1,pch=16)
segments(-1,0,0,0)
segments(0,0.512,1,0.512)
segments(1,0.896,2,0.896)
segments(2,0.992,3,0.992)
segments(3,1,4,1)
# --- LLN for die rolls: convergence of sample statistics ---
set.seed(1234)
# initialize the number of simulations
num_simulations <- 5000
# roll a fair six-sided die for all simulations
dierolls <- sample(1:6, num_simulations, replace=TRUE)
# graph-display format (two rows, two columns)
par(mfrow=c(2,2))
# plot the sample frequency of a "6" being rolled
plot(cumsum(dierolls==6)/(1:num_simulations), main="Sample proportion of 6's",
xlab="", ylab="", ylim=c(0,1), cex=0.5)
abline(h=1/6,col="red")
# plot the sample mean of die rolls
plot(cumsum(dierolls)/(1:num_simulations), main="Sample mean of die rolls",
xlab="", ylab="", ylim=c(1,6), cex=0.5)
abline(h=3.5,col="red")
# plot the sample variance of die rolls
# (running variance recomputed from scratch at each i, starting at i=2)
tempvar <- rep(0,num_simulations)
for (i in 2:num_simulations) {
tempavg <- cumsum(dierolls)[i]/i
tempvar[i] <- sum((dierolls[1:i]-tempavg)^2)/(i-1)
}
plot(tempvar[2:num_simulations], main="Sample variance of die rolls",
xlab="", ylab="", ylim=c(2,4), cex=0.5)
# population variance of a fair die is 35/12
abline(h=35/12,col="red")
# plot the sample standard deviation of die rolls
plot(sqrt(tempvar[2:num_simulations]), main="Sample standard deviation of die rolls",
xlab="", ylab="", ylim=c(1.5,2.5), cex=0.5)
abline(h=sqrt(35/12),col="red")
# --- Binomial distribution: dbinom (pmf), pbinom (cdf), rbinom (draws) ---
dbinom(3,10,0.2)
dbinom(4,10,0.2)
dbinom(0:10,10,0.2)
pbinom(0:10,10,0.2)
set.seed(1234)
rbinom(50,10,0.2)
plot(0:20, dbinom(0:20,20,0.6), type="h", axes=FALSE, main="pmf for # of days that stock goes up",
xlab="", ylab="Probability", xlim=c(0,20), ylim=c(0,0.2))
axis(1, at=0:20)
axis(2, at=seq(0,0.2,0.02))
# binomial pmf for several (n, p) combinations
par(mfrow=c(2,2))
plot(0:10,dbinom(0:10,10,0.1),type="h",main="Binomial(10,0.1) pmf",xlab="",ylab="Probability")
plot(0:10,dbinom(0:10,10,0.2),type="h",main="Binomial(10,0.2) pmf",xlab="",ylab="Probability")
plot(0:10,dbinom(0:10,10,0.5),type="h",main="Binomial(10,0.5) pmf",xlab="",ylab="Probability")
plot(0:100,dbinom(0:100,100,0.1),type="h",main="Binomial(100,0.1) pmf",xlab="",ylab="Probability")
# probability a candidate with 52% support wins a majority of the poll
1-pbinom(100,200,0.52)
1-pbinom(500,1000,0.52)
par(mfrow=c(2,1))
# pmf of the sample proportion P = X/n (x-axis rescaled to [0,1];
# NOTE(review): probabilities are also divided by n here, which puts the
# y-axis on a density-like scale rather than true probabilities)
plot((0:200)/200,dbinom(0:200,200,0.52)/200,type="h",main="pmf for P (200-voter poll)",xlab="",ylab="Probability")
abline(v=0.5,col="red",lwd=2)
abline(v=0.52,col="blue",lwd=2)
plot((0:1000)/1000,dbinom(0:1000,1000,0.52)/1000,type="h",main="pmf for P (1000-voter poll)",xlab="",ylab="Probability")
abline(v=0.5,col="red",lwd=2)
abline(v=0.52,col="blue",lwd=2)
# --- Geometric distribution (number of failures before first success) ---
dgeom(0:10,0.2)
pgeom(0:10,0.2)
# graph-display format (two rows, two columns)
par(mfrow=c(2,2))
# plots of four different geometric pmf's
plot(0:20, dgeom(0:20,0.1), type="h", main="Geo(0.1) pmf", xlab="", ylab="Probability", ylim=c(0,0.8))
plot(0:20, dgeom(0:20,0.2), type="h", main="Geo(0.2) pmf", xlab="", ylab="Probability", ylim=c(0,0.8))
plot(0:20, dgeom(0:20,0.5), type="h", main="Geo(0.5) pmf", xlab="", ylab="Probability", ylim=c(0,0.8))
plot(0:20, dgeom(0:20,0.8), type="h", main="Geo(0.8) pmf", xlab="", ylab="Probability", ylim=c(0,0.8))
# --- Negative binomial distribution ---
dnbinom(0:2,3,0.2)
# graph-display format (three rows, one column)
par(mfrow=c(3,1))
# plots of three different negative binomial pmf's
plot(0:50, dnbinom(0:50,3,0.2), type="h", main="NegBin(3,0.2) pmf", xlab="", ylab="Probability", ylim=c(0,0.06))
plot(0:50, dnbinom(0:50,3,0.1), type="h", main="NegBin(3,0.1) pmf", xlab="", ylab="Probability", ylim=c(0,0.06))
plot(0:50, dnbinom(0:50,4,0.2), type="h", main="NegBin(4,0.2) pmf", xlab="", ylab="Probability", ylim=c(0,0.06))
# --- Poisson distribution ---
# graph-display format (two rows, one column)
par(mfrow=c(2,1))
# plots of two different Poisson pmf's
plot(0:10, dpois(0:10,2), type="h", main="Poisson(2) pmf", xlab="Number of patent applications", ylab="Probability")
plot(0:10, dpois(0:10,4), type="h", main="Poisson(4) pmf", xlab="Number of patent applications", ylab="Probability")
plot(0:40, dpois(0:40,20), type="h", main="", xlab="Customers between 10am and 11am", ylab="Probability")
plot(0:6, dpois(0:6,1/3), type="h", main="", xlab="Customers during a one-minute interval", ylab="Probability")
# --- Continuous distributions: pdf's drawn with a beta example ---
par(mfrow=c(2,1))
x_beta <- seq(0,1,by=0.01)
y_beta <- dbeta(x_beta,shape1=2,shape2=7)
plot(x_beta,y_beta,type="l",yaxs="i",ylim=c(0,3.5),ylab="",xlab="",xaxt="n",yaxt="n",main="pdf example")
lines(x=c(0.3,0.3),y=c(0,dbeta(0.3,shape1=2,shape2=7)),lty=3)
lines(x=c(0.5,0.5),y=c(0,dbeta(0.5,shape1=2,shape2=7)),lty=3)
mtext("a",side=1,at=0.3)
mtext("b",side=1,at=0.5)
mtext(expression(f[X](x)),side=2,at=3.5)
# shade the region between a=0.3 and b=0.5 to show P(a < X < b)
plot(x_beta,y_beta,type="l",yaxs="i",ylim=c(0,3.5),ylab="",xlab="",xaxt="n",yaxt="n",main="pdf example with interval probability")
xshade <- seq(0.3,0.5,length=100)
yshade <- dbeta(xshade,shape1=2,shape2=7)
polygon(c(0.3,xshade,0.5),c(0,yshade,0),col="blue")
mtext("a",side=1,at=0.3)
mtext("b",side=1,at=0.5)
mtext(expression(f[X](x)),side=2,at=3.5)
# --- Uniform distribution ---
temp <- seq(-1,2,0.01)
plot(temp,dunif(temp),type='l',main="",ylab=expression(f[X](x)),xlab="x")
temp <- seq(3,12,0.01)
plot(temp,dunif(temp,5,10),type='l',main="",ylab=expression(f[X](x)),xlab="x")
# the Uniform(5,10) pdf is 0 outside [5,10] and 1/5 inside
dunif(4,5,10)
dunif(6,5,10)
dunif(8,5,10)
set.seed(1234)
runif(20,5,10)
# Triangular pdf on [0, 2]: rises linearly to a peak of 1 at x = 1,
# falls back to 0 at x = 2, and is 0 outside the support.
tri_pdf <- function(x) {
  # 1 - |x - 1| equals x on [0, 1] and (2 - x) on [1, 2]; clamping the
  # negative tails at zero reproduces the piecewise definition exactly.
  pmax(1 - abs(x - 1), 0)
}
# create a grid of values to evaluate the pdf
temp <- seq(-1,3,0.01)
# graph the pdf
plot(temp, tri_pdf(temp), type='l', main="", ylab=expression(f[X](x)), xlab="x")
par(mfrow=c(3,1))
# Beta(2,7) pdf with left-tail areas shaded up to a = 0.3 and b = 0.4
x_beta <- seq(0,1,by=0.01)
y_beta <- dbeta(x_beta,shape1=2,shape2=7)
plot(x_beta,y_beta,type="l",xaxs="i",yaxs="i",ylim=c(0,3.5),xaxt="n",xlab="",ylab=expression(f[X](x)),main="pdf example")
xshade <- seq(0,0.3,length=100)
yshade <- dbeta(xshade,shape1=2,shape2=7)
polygon(c(0,xshade,0.3),c(0,yshade,0),col="blue")
mtext("a",side=1,at=0.3)
mtext("b",side=1,at=0.4)
plot(x_beta,y_beta,type="l",xaxs="i",yaxs="i",ylim=c(0,3.5),xaxt="n",xlab="",ylab=expression(f[X](x)),main="pdf example")
xshade <- seq(0,0.4,length=100)
yshade <- dbeta(xshade,shape1=2,shape2=7)
polygon(c(0,xshade,0.4),c(0,yshade,0),col="blue")
mtext("a",side=1,at=0.3)
mtext("b",side=1,at=0.4)
# corresponding cdf with the same two points marked by dotted lines
y_beta <- pbeta(x_beta,shape1=2,shape2=7)
plot(x_beta,y_beta,type="l",xaxs="i",yaxs="i",xaxt="n",yaxt="n",xlab="",ylab=expression(F[X](x)),main="cdf example")
axis(2,seq(0,1.2,by=0.2),las=2)
lines(x=c(0.3,0.3),y=c(0,pbeta(0.3,shape1=2,shape2=7)),lty=3)
lines(x=c(0,0.3),y=c(pbeta(0.3,shape1=2,shape2=7),pbeta(0.3,shape1=2,shape2=7)),lty=3)
lines(x=c(0.4,0.4),y=c(0,pbeta(0.4,shape1=2,shape2=7)),lty=3)
lines(x=c(0,0.4),y=c(pbeta(0.4,shape1=2,shape2=7),pbeta(0.4,shape1=2,shape2=7)),lty=3)
mtext("a",side=1,at=0.3)
mtext("b",side=1,at=0.4)
# U(0,1) cdf, then an interval probability computed from it
temp <- seq(-1,2,0.01)
plot(temp,punif(temp),type='l',main="",ylab=expression(F[X](x)),xlab="x")
punif(0.5)-punif(0.2)
# Triangular cdf matching tri_pdf: quadratic pieces on [0, 1] and (1, 2],
# 0 below the support and 1 above it.
tri_cdf <- function(x) {
  # Nested ifelse() walks the four regimes of the piecewise cdf.
  ifelse(x < 0, 0,
         ifelse(x <= 1, x^2/2,
                ifelse(x <= 2, 1 - (2 - x)^2/2, 1)))
}
# create a grid of values to evaluate the pdf
temp <- seq(-1,3,0.01)
# graph the cdf
plot(temp, tri_cdf(temp), type='l', main="", ylab=expression(F[X](x)), xlab="x")
# P(0.5 < X <= 1.3) via the cdf
tri_cdf(1.3)-tri_cdf(0.5)
# the same kind of interval probabilities via numerical integration
integrate(dunif,0.2,0.8)
integrate(dunif,-Inf,0.8)
integrate(tri_pdf,0.5,1.3)
par(mfrow=c(1,1))
# cdf of a mixed discrete/continuous random variable: jumps at x = 0 and
# x = 1 (open/closed circles), linear segments elsewhere
plot(c(-1,0,0,1,1),c(0,0,0.3,0.8,1),type="l",axes=FALSE,lty=3,main="",xlab="x",ylab=expression(F[X](x)),xlim=c(-1,2),ylim=c(0,1))
axis(1,at=c(0,1))
axis(2,at=seq(0,1,0.1))
points(0,0,pch=1)
points(0,0.3,pch=16)
points(1,0.8,pch=1)
points(1,1,pch=16)
segments(-1,0,0,0)
segments(0,0.3,1,0.8)
segments(1,1,2,1)
par(mfrow=c(2,1))
# Beta(2,7) cdf with the 0.5 and 0.8 quantiles marked...
x_beta <- seq(0,1,by=0.01)
y_beta <- pbeta(x_beta,shape1=2,shape2=7)
plot(x_beta,y_beta,type="l",xaxs="i",yaxs="i",xaxt="n",yaxt="n",ylab=expression(F[X](x)),xlab="x",main="cdf example with quantiles")
axis(2,seq(0,1.2,by=0.1),las=2)
lines(x=c(0,qbeta(0.5,shape1=2,shape2=7)),y=c(0.5,0.5),lty=3)
lines(x=c(qbeta(0.5,shape1=2,shape2=7),qbeta(0.5,shape1=2,shape2=7)),y=c(0,0.5),lty=3)
lines(x=c(0,qbeta(0.8,shape1=2,shape2=7)),y=c(0.8,0.8),lty=3)
lines(x=c(qbeta(0.8,shape1=2,shape2=7),qbeta(0.8,shape1=2,shape2=7)),y=c(0,0.8),lty=3)
mtext(expression(tau[X][","][0.5]),side=1,at=qbeta(0.5,shape1=2,shape2=7))
mtext(expression(tau[X][","][0.8]),side=1,at=qbeta(0.8,shape1=2,shape2=7))
# ...and the matching pdf with the same quantiles marked
y_beta <- dbeta(x_beta,shape1=2,shape2=7)
plot(x_beta,y_beta,type="l",xaxs="i",yaxs="i",ylim=c(0,3.5),xaxt="n",ylab=expression(f[X](x)),xlab="x",main="corresponding pdf")
lines(x=c(qbeta(0.5,shape1=2,shape2=7),qbeta(0.5,shape1=2,shape2=7)),y=c(0,dbeta(qbeta(0.5,shape1=2,shape2=7),shape1=2,shape2=7)),lty=3)
lines(x=c(qbeta(0.8,shape1=2,shape2=7),qbeta(0.8,shape1=2,shape2=7)),y=c(0,dbeta(qbeta(0.8,shape1=2,shape2=7),shape1=2,shape2=7)),lty=3)
mtext(expression(tau[X][","][0.5]),side=1,at=qbeta(0.5,shape1=2,shape2=7))
mtext(expression(tau[X][","][0.8]),side=1,at=qbeta(0.8,shape1=2,shape2=7))
# uniform quantiles: default U(0,1), then U(5,10)
qunif(0.8)
qunif(c(0.50,0.75),5,10)
# Integrand x * f(x) for X ~ U(0,1), used to compute E(X) numerically.
exp_unif <- function(x) {
  x * dunif(x)
}
# numerical integration to evaluate E(X) for X~U(0,1)
integrate(exp_unif,-Inf,Inf)
# $value extracts the numeric estimate from the integrate() result
integrate(exp_unif,-Inf,Inf)$value
# store E(X) for X~U(0,1) in a variable
mean_unif <- integrate(exp_unif,-Inf,Inf)$value
# Integrand (x - E(X))^2 * f(x) for X ~ U(0,1), used to compute Var(X).
# Relies on mean_unif (the numerically computed E(X)) being in scope.
var_unif <- function(x) {
  dev <- x - mean_unif
  dev^2 * dunif(x)
}
# numerical integration to evaluate Var(X) for X~U(0,1)
integrate(var_unif,-Inf,Inf)
# numerical integration to evaluate sd(X) for X~U(0,1)
sqrt(integrate(var_unif,-Inf,Inf)$value)
# Integrand x * f(x) for the triangular pdf, used to compute E(X).
exp_tri <- function(x) {
  x * tri_pdf(x)
}
# store and print E(X) for the triangular random variable
mean_tri <- integrate(exp_tri,-Inf,Inf)$value
print(mean_tri)
# Integrand (x - E(X))^2 * f(x) for the triangular pdf, used to compute
# Var(X). Relies on mean_tri (the numerically computed E(X)) in scope.
var_tri <- function(x) {
  dev <- x - mean_tri
  dev^2 * tri_pdf(x)
}
# numerical integration to evaluate Var(X) for a triangular random variable
integrate(var_tri,-Inf,Inf)
# numerical integration to evaluate sd(X) for a triangular random variable
sqrt(integrate(var_tri,-Inf,Inf)$value)
# reset graph-display format to a single panel
par(mfrow=c(1,1))
# Boundary of the budget constraint x + 0.5*y = 90, solved for y.
budget_line <- function(x) {
  180 - 2 * x
}
temp <- seq(30,110,1)
# budget line with the feasible (X, Y) rectangle overlaid
plot(temp,budget_line(temp),type='l',main="",ylab="Y",xlab="X",ylim=c(30,60))
# rectangle: X (filming days) in [60,80], Y (editing days) in [40,50]
segments(60,40,80,40,lwd=3)
segments(60,50,80,50,lwd=3)
segments(60,40,60,50,lwd=3)
segments(80,40,80,50,lwd=3)
# shade the portion of the rectangle above the budget line
polygon(c(70,65,80,80,70),c(40,50,50,40,40),col="blue")
text(70,38.5,"(70,40)",adj=c(1,0),cex=0.7)
text(65,50.5,"(65,50)",adj=c(0,0),cex=0.7)
set.seed(1234)
# initialize the number of simulations
num_simulations <- 100000
# create simulated draws
days_filming_draws <- runif(num_simulations,min=60,max=80)
days_editing_draws <- runif(num_simulations,min=40,max=50)
# calculate frequency corresponding to costs being over-budget
mean(days_filming_draws + 0.5*days_editing_draws > 90)
# unifed provides dirwin.hall(), the Irwin-Hall (sum of uniforms) pdf
library(unifed)
par(mfrow=c(2,2))
# pdf of the sum of two iid U(0,1) draws (Irwin-Hall with m = 2).
# dirwin.hall() (package 'unifed') takes a scalar, so map per element.
# Outside the support [0, 2] the density is 0.
# Fixes: the original `for (i in 1:length(x))` loop errored on empty
# input (1:0 iterates) and on NA input (NA condition in if()).
unif2_pdf <- function(x) {
  vapply(x, function(xi) {
    if (is.na(xi)) return(NA_real_)  # propagate NA instead of erroring
    if (xi < 0 || xi > 2) return(0)
    dirwin.hall(xi, 2)
  }, numeric(1))
}
# pdf of the sum of three iid U(0,1) draws (Irwin-Hall with m = 3).
# dirwin.hall() (package 'unifed') takes a scalar, so map per element.
# Outside the support [0, 3] the density is 0.
# Fixes: the original `for (i in 1:length(x))` loop errored on empty
# input (1:0 iterates) and on NA input (NA condition in if()).
unif3_pdf <- function(x) {
  vapply(x, function(xi) {
    if (is.na(xi)) return(NA_real_)  # propagate NA instead of erroring
    if (xi < 0 || xi > 3) return(0)
    dirwin.hall(xi, 3)
  }, numeric(1))
}
# pdf of the sum of ten iid U(0,1) draws (Irwin-Hall with m = 10).
# dirwin.hall() (package 'unifed') takes a scalar, so map per element.
# Outside the support [0, 10] the density is 0.
# Fixes: the original `for (i in 1:length(x))` loop errored on empty
# input (1:0 iterates) and on NA input (NA condition in if()).
unif10_pdf <- function(x) {
  vapply(x, function(xi) {
    if (is.na(xi)) return(NA_real_)  # propagate NA instead of erroring
    if (xi < 0 || xi > 10) return(0)
    dirwin.hall(xi, 10)
  }, numeric(1))
}
# shapes of the distribution of the mean of m uniforms, via Irwin-Hall
x <- seq(-1,2,0.001)
plot(x,dunif(x),type="l",xlab="v",ylab=expression(f[V](v)),main=expression(m==1))
# NOTE(review): the change-of-variables factor m appears to be missing —
# the density of V = S/m is m*f_S(m*v) (cf. 2*dirwin.hall(2*temp[i],2)
# used later in this file), so these curves are not normalized; confirm
# whether only the shapes are intended here.
plot(x,unif2_pdf(2*x),type="l",xlab="v",ylab=expression(f[V](v)),main=expression(m==2))
plot(x,unif3_pdf(3*x),type="l",xlab="v",ylab=expression(f[V](v)),main=expression(m==3))
plot(x,unif10_pdf(10*x),type="l",xlab="v",ylab=expression(f[V](v)),main=expression(m==10))
par(mfrow=c(2,2))
# same exercise with sums of Gamma(1, 1/2) components
x = seq(0.01,5,0.01)
plot(x,dgamma(x,1,1/2),type="l",xlab="v",ylab=expression(f[V](v)),main=expression(m==1))
plot(x,dgamma(3*x,3,1/2),type="l",xlab="v",ylab=expression(f[V](v)),main=expression(m==3))
plot(x,dgamma(10*x,10,1/2),type="l",xlab="v",ylab=expression(f[V](v)),main=expression(m==10))
plot(x,dgamma(20*x,20,1/2),type="l",xlab="v",ylab=expression(f[V](v)),main=expression(m==20))
par(mfrow=c(1,1))
# standard normal pdf with a dotted vertical at the mean
x_normal <- seq(-4,4,by=0.01)
y_normal <- dnorm(x_normal)
plot(x_normal,y_normal,type="l",yaxs="i",ylim=c(0,0.5),ylab=expression(f[X](x)),xlab="x",xaxt="n",yaxt="n")
#axis(1,labels=FALSE,xaxt="n")
mtext(expression(mu),side=1,at=0)
lines(x=c(0,0),y=c(0,0.5),lty=3)
par(mfrow=c(2,1))
# effect of shifting the mean: dnorm(x) vs dnorm(x - 1.5)
x_normal <- seq(-4,6,by=0.01)
y_normal <- dnorm(x_normal)
plot(x_normal,y_normal,type="l",yaxs="i",ylim=c(0,0.5),ylab=expression(f[X](x)),xlab="x",xaxt="n",yaxt="n",main="Shift in location")
lines(x_normal,dnorm(x_normal-1.5),col="blue",lty=3)
axis(1,labels=FALSE,xaxt="n")
mtext(expression(mu),side=1,at=0)
lines(x=c(0,0),y=c(0,0.5),lty=3)
lines(x=c(1.5,1.5),y=c(0,0.5),lty=3,col="blue")
# effect of increasing the spread: sd = 1 vs sd = 2
x_normal <- seq(-6,6,by=0.01)
y_normal <- dnorm(x_normal)
plot(x_normal,y_normal,type="l",yaxs="i",ylim=c(0,0.5),ylab=expression(f[X](x)),xlab="x",xaxt="n",yaxt="n",main="Increase in variance")
lines(x_normal,dnorm(x_normal,0,2),col="blue",lty=3)
axis(1,labels=FALSE,xaxt="n")
mtext(expression(mu),side=1,at=0)
lines(x=c(0,0),y=c(0,0.5),lty=3)
par(mfrow=c(2,1))
# normal pdf and cdf stacked for comparison
x_normal <- seq(-4,4,by=0.01)
y_normal <- dnorm(x_normal)
plot(x_normal,y_normal,type="l",yaxs="i",ylim=c(0,0.5),ylab=expression(f[X](x)),xlab="x",xaxt="n",yaxt="n",main="pdf of normal random variable")
axis(1,labels=FALSE,xaxt="n")
mtext(expression(mu),side=1,at=0)
lines(x=c(0,0),y=c(0,0.5),lty=3)
x_normal <- seq(-4,4,by=0.01)
y_normal <- pnorm(x_normal)
plot(x_normal,y_normal,type="l",yaxs="i",ylim=c(0,1),ylab=expression(F[X](x)),xlab="x",xaxt="n",yaxt="n",main="cdf of normal random variable")
axis(1,labels=FALSE,xaxt="n")
axis(2,at=seq(0,1,by=0.1),las=2)
mtext(expression(mu),side=1,at=0)
lines(x=c(0,0),y=c(0,1),lty=3)
lines(x=c(-5,0),y=c(0.5,0.5),lty=3)
set.seed(1234)
# standard normal density/cdf values and random draws
dnorm(0)
dnorm(1)
pnorm(1)
rnorm(10)
# same for mean 0, sd 3; note the argument is the sd, not the variance
dnorm(0,mean=0,sd=3)
dnorm(1,mean=0,sd=3)
pnorm(1,mean=0,sd=3)
rnorm(10,mean=0,sd=3)
par(mfrow=c(2,1))
# shaded areas: probability within 1 and 2 standard deviations of the mean
x_normal <- seq(-4,4,by=0.01)
y_normal <- dnorm(x_normal)
plot(x_normal,y_normal,type="l",yaxs="i",ylim=c(0,0.5),ylab=expression(f[X](x)),xlab="x",xaxt="n",yaxt="n",main=expression(paste("probability of being within ",sigma," of the mean ",mu)))
axis(1,labels=FALSE,xaxt="n")
mtext(expression(mu),side=1,at=0)
mtext(expression(mu-sigma),side=1,at=-1)
mtext(expression(mu+sigma),side=1,at=1)
xshade <- seq(-1,1,length=100)
yshade <- dnorm(xshade)
polygon(c(-1,xshade,1),c(0,yshade,0),col="blue")
lines(x=c(0,0),y=c(0,0.5),lty=3)
x_normal <- seq(-4,4,by=0.01)
y_normal <- dnorm(x_normal)
plot(x_normal,y_normal,type="l",yaxs="i",ylim=c(0,0.5),ylab=expression(f[X](x)),xlab="x",xaxt="n",yaxt="n",main=expression(paste("probability of being within 2",sigma," of the mean ",mu)))
axis(1,labels=FALSE,xaxt="n")
mtext(expression(mu),side=1,at=0)
mtext(expression(mu-2*sigma),side=1,at=-2)
mtext(expression(mu+2*sigma),side=1,at=2)
xshade <- seq(-2,2,length=100)
yshade <- dnorm(xshade)
polygon(c(-2,xshade,2),c(0,yshade,0),col="blue")
lines(x=c(0,0),y=c(0,0.5),lty=3)
par(mfrow=c(1,1))
# standard normal pdf phi(z) with a numeric z axis
x_normal <- seq(-4,4,by=0.01)
y_normal <- dnorm(x_normal)
plot(x_normal,y_normal,type="l",yaxs="i",ylim=c(0,0.5),xaxt="n",ylab=expression(phi(z)),xlab=expression(z),main="")
axis(1,at=seq(-4,4,by=1),las=1)
lines(x=c(0,0),y=c(0,0.5),lty=3)
lines(x=c(-5,0),y=c(dnorm(0),dnorm(0)),lty=3)
# return the 2.5%, 5%, 95%, and 97.5% quantiles of N(0,1)
qnorm(c(0.025,0.05,0.95,0.975))
# confirm the quantiles and probability intervals using the cdf
pnorm(-1.96)
pnorm(1.96)
pnorm(1.96)-pnorm(-1.96)
pnorm(-1.645)
pnorm(1.645)
pnorm(1.645)-pnorm(-1.645)
# assorted normal quantiles and tail probabilities
qnorm(0.85)
qnorm(0.90)
1-pnorm(-0.875)
qnorm(0.925)
1-pnorm(-2.5/sqrt(17))
1-pnorm(0.04/sqrt(0.00618))
# mean-stdev frontier for a two-asset portfolio with per-asset
# (mean, sd) = (0.03, 0.01) and (0.07, 0.08), correlation 0.2
wgtx <- seq(0,1,by=0.01)
portfolio_mean <- wgtx*(0.03)+(1-wgtx)*(0.07)
portfolio_variance <- (wgtx^2)*(0.01^2)+((1-wgtx)^2)*(0.08^2)+2*wgtx*(1-wgtx)*(0.2)*(0.01)*(0.08)
portfolio_stdev <- sqrt(portfolio_variance)
plot(portfolio_mean,portfolio_stdev,type="l",main="",xlab="Population mean of portfolio return",ylab="Population standard deviation of portfolio return")
# mark and label portfolios at selected weights w along the frontier
points(0.05,sqrt(0.001705),pch=16)
points(0.2*0.07+0.8*0.03,sqrt(0.04*0.08*0.08+0.64*0.01*0.01+2*0.2*0.8*0.00016),pch=16)
points(0.8*0.07+0.2*0.03,sqrt(0.64*0.08*0.08+0.04*0.01*0.01+2*0.2*0.8*0.00016),pch=16)
points(0.03,0.01,pch=16)
points(0.07,0.08,pch=16)
text(0.03 + 0.001,0.01 - 0.001,"w=0",cex=0.5)
text(0.05 + 0.0015,sqrt(0.001705),"w=0.5",cex=0.5)
text(0.2*0.07+0.8*0.03 + 0.0015,sqrt(0.04*0.08*0.08+0.64*0.01*0.01+2*0.2*0.8*0.00016),"w=0.2",cex=0.5)
text(0.8*0.07+0.2*0.03 + 0.0015,sqrt(0.64*0.08*0.08+0.04*0.01*0.01+2*0.2*0.8*0.00016),"w=0.8",cex=0.5)
text(0.07 - 0.0013,0.08,"w=1",cex=0.5)
par(mfrow=c(1,1))
# standard lognormal pdf
x_lnormal <- seq(0.01,10,by=0.01)
y_lnormal <- dlnorm(x_lnormal)
plot(x_lnormal,y_lnormal,type="l",main="",xlab="x",ylab=expression(f[X](x)),xaxt="n",yaxt="n")
# create log-earnings variable (cpsemployed is assumed loaded elsewhere)
logwage <- log(cpsemployed$earnwk)
# graph-display format (two rows, one column)
par(mfrow=c(2,1))
# histogram of earnings, with normal distribution overlaid
hist(cpsemployed$earnwk, breaks=50, freq=FALSE, main="Distribution of weekly earnings", xlab="weekly earnings")
lines(0:8000, dnorm(0:8000,mean(cpsemployed$earnwk),sd(cpsemployed$earnwk)), col="blue")
# histogram of log-earnings, with normal distribution overlaid
hist(logwage, breaks=50, freq=FALSE, main="Distribution of ln(weekly earnings)", xlab="ln(weekly earnings)")
lines(seq(3,9,by=0.01), dnorm(seq(3,9,by=0.01),mean(logwage),sd(logwage)), col="blue")
# 95% central interval of a LN(6.5, 0.7) random variable
qlnorm(c(0.025,0.975), meanlog=6.5, sdlog=0.7)
# probability of exceeding 1000
1-plnorm(1000, meanlog=6.5, sdlog=0.7)
set.seed(1234)
# ten simulated draws from the lognormal model
rlnorm(10, meanlog=6.5, sdlog=0.7)
# population mean of LN(mu, sigma): exp(mu + sigma^2/2)
exp(6.5+0.5*(0.7)^2)
# population stdev: exp(mu + sigma^2/2) * sqrt(exp(sigma^2) - 1)
# (fixed: the variance term is exp(sigma^2) = exp(0.7^2), not exp(0.7)^2)
exp(6.5+0.5*(0.7)^2)*sqrt(exp(0.7^2)-1)
# graph-display format (two rows, two columns)
par(mfrow=c(2,2))
temp <- seq(0.01,20,by=0.01)
# chi-square pdfs for increasing degrees of freedom
# (fixed: stray empty arguments ",," removed from the df=8 and df=10 calls)
plot(temp,dchisq(temp,4),type="l",xlab="x",ylab=expression(f[X](x)),main="chi-square random variable, df=4",ylim=c(0,0.2),xlim=c(0,20))
plot(temp,dchisq(temp,6),type="l",xlab="x",ylab=expression(f[X](x)),main="chi-square random variable, df=6",ylim=c(0,0.2),xlim=c(0,20))
plot(temp,dchisq(temp,8),type="l",xlab="x",ylab=expression(f[X](x)),main="chi-square random variable, df=8",ylim=c(0,0.2),xlim=c(0,20))
plot(temp,dchisq(temp,10),type="l",xlab="x",ylab=expression(f[X](x)),main="chi-square random variable, df=10",ylim=c(0,0.2),xlim=c(0,20))
# 10% quantiles for each df
qchisq(0.10,4)
qchisq(0.10,6)
qchisq(0.10,8)
qchisq(0.10,10)
# chi-square with 1 df is the square of a standard normal
pchisq((1.96)^2,1)
pchisq((1.645)^2,1)
par(mfrow=c(1,3))
temp <- seq(0.01,20,by=0.01)
# exponential pdfs for three rate parameters
plot(temp,dexp(temp,1),type="l",xlab="x",ylab=expression(f[X](x)),ylim=c(0,0.8),xlim=c(0,10),main=expression(paste("Exponential RV, ",theta==1)))
plot(temp,dexp(temp,0.5),type="l",xlab="x",ylab=expression(f[X](x)),ylim=c(0,0.8),xlim=c(0,10),main=expression(paste("Exponential RV, ",theta==0.5)))
plot(temp,dexp(temp,0.2),type="l",xlab="x",ylab=expression(f[X](x)),ylim=c(0,0.8),xlim=c(0,10),main=expression(paste("Exponential RV, ",theta==0.2)))
# unit-interval probabilities for Exp(rate = 0.5)
pexp(1,rate=0.5)-pexp(0,rate=0.5)
pexp(2,rate=0.5)-pexp(1,rate=0.5)
pexp(3,rate=0.5)-pexp(2,rate=0.5)
set.seed(1234)
rexp(10,rate=0.5)
# simulated mean and sd of Exp(0.5) draws
temp <- rexp(100000,rate=0.5)
mean(temp)
sd(temp)
# pmf of the sample mean of n Bernoulli(0.2) draws (scaled binomial)
par(mfrow=c(2,2))
plot((0:2)/2,dbinom(0:2,2,prob=0.2),type="h",main=expression(n==2),xlab=expression(v),ylab=expression(p[bar(X)](v)),ylim=c(0,1))
plot((0:4)/4,dbinom(0:4,4,prob=0.2),type="h",main=expression(n==4),xlab=expression(v),ylab=expression(p[bar(X)](v)),ylim=c(0,1))
plot((0:10)/10,dbinom(0:10,10,prob=0.2),type="h",main=expression(n==10),xlab=expression(v),ylab=expression(p[bar(X)](v)),ylim=c(0,1))
plot((0:20)/20,dbinom(0:20,20,prob=0.2),type="h",main=expression(n==20),xlab=expression(v),ylab=expression(p[bar(X)](v)),ylim=c(0,1))
library(unifed)
# exact pdf of the sample mean of n U(0,1) draws: n * dirwin.hall(n*v, n)
temp <- seq(0.001,0.999,by=0.001)
unif2 <- rep(0,999)
unif3 <- rep(0,999)
unif5 <- rep(0,999)
unif10 <- rep(0,999)
for (i in 1:999) {
unif2[i] <- 2*dirwin.hall(2*temp[i],2)
unif3[i] <- 3*dirwin.hall(3*temp[i],3)
unif5[i] <- 5*dirwin.hall(5*temp[i],5)
unif10[i] <- 10*dirwin.hall(10*temp[i],10)
}
par(mfrow=c(2,2))
plot(temp,unif2,type="l",main=expression(n==2),ylim=c(0,4.3),xlab=expression(v),ylab=expression(f[bar(X)](v)))
plot(temp,unif3,type="l",main=expression(n==3),ylim=c(0,4.3),xlab=expression(v),ylab=expression(f[bar(X)](v)))
plot(temp,unif5,type="l",main=expression(n==5),ylim=c(0,4.3),xlab=expression(v),ylab=expression(f[bar(X)](v)))
plot(temp,unif10,type="l",main=expression(n==10),ylim=c(0,4.3),xlab=expression(v),ylab=expression(f[bar(X)](v)))
set.seed(1234)
# initialize the number of simulations
num_simulations <- 100000
# create a vector of sample averages for n=2
xmean_2 <- replicate(num_simulations, mean(rlnorm(2, meanlog=0, sdlog=1)))
# create a vector of sample averages for n=5
xmean_5 <- replicate(num_simulations, mean(rlnorm(5, meanlog=0, sdlog=1)))
# create a vector of sample averages for n=10
xmean_10 <- replicate(num_simulations, mean(rlnorm(10, meanlog=0, sdlog=1)))
# create a vector of sample averages for n=20
xmean_20 <- replicate(num_simulations, mean(rlnorm(20, meanlog=0, sdlog=1)))
# graph-display format (two rows, two columns)
par(mfrow=c(2,2))
# plot smoothed densities for all four sample sizes
plot(density(xmean_2), main=expression(n==2), xlim=c(0,8),
xlab=expression(v), ylab=expression(f[bar(X)](v)))
plot(density(xmean_5), main=expression(n==5), xlim=c(0,8),
xlab=expression(v), ylab=expression(f[bar(X)](v)))
plot(density(xmean_10), main=expression(n==10), xlim=c(0,8),
xlab=expression(v), ylab=expression(f[bar(X)](v)))
plot(density(xmean_20), main=expression(n==20), xlim=c(0,8),
xlab=expression(v), ylab=expression(f[bar(X)](v)))
# chi-square quantiles/cdf with 9 df (sample variance, n = 10)
qchisq(c(0.025,0.975),9)
pchisq(6.25,9)
set.seed(1234)
# initialize the number of simulations
num_simulations <- 100000
# create a vector of sample variances for n=2
varunif_2 <- replicate(num_simulations, var(runif(2)))
# create a vector of sample variances for n=3
varunif_3 <- replicate(num_simulations, var(runif(3)))
# create a vector of sample variances for n=5
varunif_5 <- replicate(num_simulations, var(runif(5)))
# create a vector of sample variances for n=10
varunif_10 <- replicate(num_simulations, var(runif(10)))
# graph-display format (two rows, two columns)
par(mfrow=c(2,2))
# plot smoothed densities for all four sample sizes
plot(density(varunif_2), xlim=c(0,0.5), ylim=c(0,15), main=expression(n==2),
xlab=expression(v), ylab=expression(f[s[X]^2](v)))
plot(density(varunif_3), xlim=c(0,0.5), ylim=c(0,15), main=expression(n==3),
xlab=expression(v), ylab=expression(f[s[X]^2](v)))
plot(density(varunif_5), xlim=c(0,0.5), ylim=c(0,15), main=expression(n==5),
xlab=expression(v), ylab=expression(f[s[X]^2](v)))
plot(density(varunif_10), xlim=c(0,0.5), ylim=c(0,15), main=expression(n==10),
xlab=expression(v), ylab=expression(f[s[X]^2](v)))
# corresponding sample standard deviations and their densities
sdunif_2 <- sqrt(varunif_2)
sdunif_3 <- sqrt(varunif_3)
sdunif_5 <- sqrt(varunif_5)
sdunif_10 <- sqrt(varunif_10)
par(mfrow=c(2,2))
plot(density(sdunif_2), xlim=c(0,0.8), ylim=c(0,10), main=expression(n==2),
xlab=expression(v), ylab=expression(f[s[X]](v)))
plot(density(sdunif_3), xlim=c(0,0.8), ylim=c(0,10), main=expression(n==3),
xlab=expression(v),ylab=expression(f[s[X]](v)))
plot(density(sdunif_5), xlim=c(0,0.8), ylim=c(0,10), main=expression(n==5),
xlab=expression(v),ylab=expression(f[s[X]](v)))
plot(density(sdunif_10), xlim=c(0,0.8), ylim=c(0,10), main=expression(n==10),
xlab=expression(v),ylab=expression(f[s[X]](v)))
set.seed(1234)
# initialize the number of simulations
num_simulations <- 100000
# create a vector of sample standard deviations for n=10
sdlogn_10 <- replicate(num_simulations, sd(rlnorm(10)))
# create a vector of sample standard deviations for n=30
sdlogn_30 <- replicate(num_simulations, sd(rlnorm(30)))
# graph-display format (two rows, one column)
par(mfrow=c(2,1))
# plot smoothed densities for the two sample sizes
plot(density(sdlogn_10), main=expression(n==10), xlim=c(0,8),
xlab=expression(v), ylab=expression(f[s[X]](v)))
plot(density(sdlogn_30), main=expression(n==30), xlim=c(0,8),
xlab=expression(v), ylab=expression(f[s[X]](v)))
# exact pdf of the sample maximum of n N(0,1) draws: n*phi(v)*Phi(v)^(n-1)
temp <- seq(-2,4,0.01)
par(mfrow=c(3,1))
plot(temp,5*dnorm(temp)*(pnorm(temp)^4),type="l",main=expression(n==5),xlim=c(-2,4),ylim=c(0,1),xlab=expression(v),ylab=expression(f[max[X]](v)))
plot(temp,10*dnorm(temp)*(pnorm(temp)^9),type="l",main=expression(n==10),xlim=c(-2,4),ylim=c(0,1),xlab=expression(v),ylab=expression(f[max[X]](v)))
plot(temp,30*dnorm(temp)*(pnorm(temp)^29),type="l",main=expression(n==30),xlim=c(-2,4),ylim=c(0,1),xlab=expression(v),ylab=expression(f[max[X]](v)))
set.seed(1234)
# initialize the number of simulations
num_simulations <- 100000
# create a vector of sample medians for n=3
mednorm_3 <- replicate(num_simulations, median(rnorm(3)))
# create a vector of sample medians for n=10
mednorm_10 <- replicate(num_simulations, median(rnorm(10)))
# create a vector of sample medians for n=20
mednorm_20 <- replicate(num_simulations, median(rnorm(20)))
# graph-display format (three rows, one column)
par(mfrow=c(3,1))
# plot smoothed densities for the two sample sizes
# for n=20, plot the sampling dist'n of the sample mean for comparison
plot(density(mednorm_3), ylim=c(0,2), xlim=c(-2,2), main=expression(n==3),
xlab=expression(v), ylab=expression(f[widetilde(X)[0.5]](v)))
plot(density(mednorm_10), ylim=c(0,2), xlim=c(-2,2), main=expression(n==10),
xlab=expression(v), ylab=expression(f[widetilde(X)[0.5]](v)))
plot(density(mednorm_20), ylim=c(0,2), xlim=c(-2,2), main=expression(n==20),
xlab=expression(v), ylab=expression(f[widetilde(X)[0.5]](v)))
lines(seq(-2,2,0.01), dnorm(seq(-2,2,0.01),sd=1/sqrt(20)), lty=3)
# CLT probability for a sample mean: mu=20, sigma=4, n=60
pnorm((18.5-20)/(4/sqrt(60)))
# normal approximation to the sample proportion (p = 0.2), exact pmf in
# black with the scaled normal curve overlaid in blue
par(mfrow=c(2,2))
plot((0:10)/10,dbinom(0:10,10,0.2),type="h",main=expression(n==10),xlab=expression(v),ylab=expression(P(bar(X)==v)),xlim=c(-0.2,1))
lines(seq(-0.2,1,0.01),dnorm(seq(-0.2,1,0.01),mean=.2,sd=sqrt(.2*.8/10))/10,col="blue")
plot((0:20)/20,dbinom(0:20,20,0.2),type="h",main=expression(n==20),xlab=expression(v),ylab=expression(P(bar(X)==v)),xlim=c(-0.2,1))
lines(seq(-0.2,1,0.01),dnorm(seq(-0.2,1,0.01),mean=.2,sd=sqrt(.2*.8/20))/20,col="blue")
plot((0:50)/50,dbinom(0:50,50,0.2),type="h",main=expression(n==50),xlab=expression(v),ylab=expression(P(bar(X)==v)),xlim=c(-0.2,1))
lines(seq(-0.2,1,0.01),dnorm(seq(-0.2,1,0.01),mean=.2,sd=sqrt(.2*.8/50))/50,col="blue")
plot((0:100)/100,dbinom(0:100,100,0.2),type="h",main=expression(n==100),xlab=expression(v),ylab=expression(P(bar(X)==v)),xlim=c(-0.2,1))
lines(seq(-0.2,1,0.01),dnorm(seq(-0.2,1,0.01),mean=.2,sd=sqrt(.2*.8/100))/100,col="blue")
# exact binomial probabilities vs continuity-corrected normal approximations
dbinom(10,50,0.2)
pnorm((10.5-10)/sqrt(8))-pnorm((9.5-10)/sqrt(8))
pbinom(54,100,0.40)-pbinom(34,100,0.40)
pnorm((0.455-0.4)/0.0490)-pnorm((0.345-0.4)/0.0490)
1-pnorm(-0.08/sqrt(0.009536))
# sampling distribution of the sample variance for normal data: scaled
# chi-square(n-1) pdf with its normal approximation overlaid in blue
par(mfrow=c(2,2))
temp <- seq(0.01,400,0.01)
temp2 <- seq(0.01,3,0.01)
plot(temp/9,9*dchisq(temp,9),type="l",main=expression(n==10),xlab=expression(v),ylab=expression(f[s[X]^2](v)),xlim=c(0,3),ylim=c(0,3))
lines(temp2,dnorm(temp2,mean=1,sd=sqrt(2/10)),col="blue",lty=3)
plot(temp/19,19*dchisq(temp,19),type="l",main=expression(n==20),xlab=expression(v),ylab=expression(f[s[X]^2](v)),xlim=c(0,3),ylim=c(0,3))
lines(temp2,dnorm(temp2,mean=1,sd=sqrt(2/20)),col="blue",lty=3)
plot(temp/49,49*dchisq(temp,49),type="l",main=expression(n==50),xlab=expression(v),ylab=expression(f[s[X]^2](v)),xlim=c(0,3),ylim=c(0,3))
lines(temp2,dnorm(temp2,mean=1,sd=sqrt(2/50)),col="blue",lty=3)
plot(temp/99,99*dchisq(temp,99),type="l",main=expression(n==100),xlab=expression(v),ylab=expression(f[s[X]^2](v)),xlim=c(0,3),ylim=c(0,3))
lines(temp2,dnorm(temp2,mean=1,sd=sqrt(2/100)),col="blue",lty=3)
set.seed(1234)
# simulate sampling distributions of the sample correlation for
# n in {50, 100} crossed with true correlations {0, 0.4, 0.8}
corr_50_00 <- rep(0,100000)
corr_50_40 <- rep(0,100000)
corr_50_80 <- rep(0,100000)
corr_100_00 <- rep(0,100000)
corr_100_40 <- rep(0,100000)
corr_100_80 <- rep(0,100000)
for (i in 1:100000) {
x1 <- rnorm(50)
x2 <- rnorm(50)
x3 <- rnorm(50,0.4*x1,sqrt((1-0.4*0.4)))
x4 <- rnorm(50,0.8*x1,sqrt((1-0.8*0.8)))
x5 <- rnorm(100)
x6 <- rnorm(100)
x7 <- rnorm(100,0.4*x5,sqrt((1-0.4*0.4)))
x8 <- rnorm(100,0.8*x5,sqrt((1-0.8*0.8)))
corr_50_00[i] <- cor(x1,x2)
corr_50_40[i] <- cor(x1,x3)
corr_50_80[i] <- cor(x1,x4)
corr_100_00[i] <- cor(x5,x6)
corr_100_40[i] <- cor(x5,x7)
corr_100_80[i] <- cor(x5,x8)
}
# overlay the normal approximation N(rho, (1-rho^2)^2/n) on each density
par(mfrow=c(2,3))
plot(density(corr_50_00),main=expression(n==50~", "~rho[XY]==0),xlab=expression(v),ylab=expression(f[r[XY]](v)),xlim=c(-0.4,0.4))
lines(seq(-0.6,0.6,0.01),dnorm(seq(-0.6,0.6,0.01),mean=0,sd=sqrt(1/50)),col="blue",lty=3)
plot(density(corr_50_40),main=expression(n==50~", "~rho[XY]==0.4),xlab=expression(v),ylab=expression(f[r[XY]](v)),xlim=c(0,0.8))
lines(seq(-0.2,1,0.01),dnorm(seq(-0.2,1,0.01),mean=0.4,sd=sqrt((1-0.16)^2/50)),col="blue",lty=3)
plot(density(corr_50_80),main=expression(n==50~", "~rho[XY]==0.8),xlab=expression(v),ylab=expression(f[r[XY]](v)),xlim=c(0.6,1))
lines(seq(0.5,1,0.01),dnorm(seq(0.5,1,0.01),mean=0.8,sd=sqrt((1-0.64)^2/50)),col="blue",lty=3)
plot(density(corr_100_00),main=expression(n==100~", "~rho[XY]==0),xlab=expression(v),ylab=expression(f[r[XY]](v)),xlim=c(-0.4,0.4))
lines(seq(-0.6,0.6,0.01),dnorm(seq(-0.6,0.6,0.01),mean=0,sd=sqrt(1/100)),col="blue",lty=3)
plot(density(corr_100_40),main=expression(n==100~", "~rho[XY]==0.4),xlab=expression(v),ylab=expression(f[r[XY]](v)),xlim=c(0,0.8))
lines(seq(-0.2,1,0.01),dnorm(seq(-0.2,1,0.01),mean=0.4,sd=sqrt((1-0.16)^2/100)),col="blue",lty=3)
plot(density(corr_100_80),main=expression(n==100~", "~rho[XY]==0.8),xlab=expression(v),
ylab=expression(f[r[XY]](v)),xlim=c(0.6,1))
lines(seq(0.5,1,0.01),dnorm(seq(0.5,1,0.01),mean=0.8,sd=sqrt((1-0.64)^2/100)),col="blue",lty=3)
# pdf of the sample maximum of n N(0,1) draws for very large n
temp <- seq(2.5,5.5,0.01)
par(mfrow=c(2,1))
plot(temp,5000*dnorm(temp)*(pnorm(temp)^4999),type="l",main=expression(n==5000),xlim=c(2.5,5.5),ylim=c(0,2),xlab=expression(v),ylab=expression(f[max[X]](v)))
plot(temp,20000*dnorm(temp)*(pnorm(temp)^19999),type="l",main=expression(n==20000),xlim=c(2.5,5.5),ylim=c(0,2),xlab=expression(v),ylab=expression(f[max[X]](v)))
# t pdfs for several df, each with the N(0,1) pdf overlaid (dotted blue)
par(mfrow=c(2,2))
temp <- seq(-4,4,by=0.01)
plot(temp,dt(temp,3),type="l",xlab="v",ylab=expression(f[t[3]](v)),main="df=3",ylim=c(0,0.4))
lines(temp,dnorm(temp),col="blue",lty=3)
plot(temp,dt(temp,5),type="l",xlab="v",ylab=expression(f[t[5]](v)),main="df=5",ylim=c(0,0.4))
lines(temp,dnorm(temp),col="blue",lty=3)
plot(temp,dt(temp,10),type="l",xlab="v",ylab=expression(f[t[10]](v)),main="df=10",ylim=c(0,0.4))
lines(temp,dnorm(temp),col="blue",lty=3)
plot(temp,dt(temp,30),type="l",xlab="v",ylab=expression(f[t[30]](v)),main="df=30",ylim=c(0,0.4))
lines(temp,dnorm(temp),col="blue",lty=3)
# t critical values approach the normal ones as df grows
qt(c(0.975,0.95),3)
qt(c(0.975,0.95),5)
qt(c(0.975,0.95),10)
qt(c(0.975,0.95),30)
qt(c(0.975,0.95),100)
qt(c(0.975,0.95),500)
par(mfrow=c(2,2))
# t with 5 df: shade the two 2.5% tails beyond the critical values
temp <- seq(-4,4,by=0.01)
y_values <- dt(temp,5)
plot(temp,y_values,type="l",yaxs="i",ylim=c(0,0.42),ylab=expression(f[t[5]](v)),xlab="v",xaxt="n",yaxt="n",main=expression(paste(t[5]," distribution")))
axis(1,labels=FALSE,xaxt="n")
mtext("0",side=1,at=0,cex=0.6)
mtext(expression(-t[5*","*0.025]),side=1,at=qt(0.025,5)-0.15,cex=0.6)
mtext(expression(t[5*","*0.025]),side=1,at=qt(0.975,5)+0.15,cex=0.6)
xshade <- seq(-4,qt(0.025,5),length=100)
yshade <- dt(xshade,5)
polygon(c(-4,xshade,qt(0.025,5)),c(0,yshade,0),col="blue")
xshade <- seq(qt(0.975,5),4,length=100)
yshade <- dt(xshade,5)
polygon(c(qt(0.975,5),xshade,4),c(0,yshade,0),col="blue")
lines(c(0,0),c(0,0.42),lty=3)
# t with 30 df: shade the two 2.5% tails beyond the critical values
# (fixed: the tail shading previously used qt(...,5) quantiles on this
# df=30 plot; it now uses the df=30 critical values, matching the labels)
temp <- seq(-4,4,by=0.01)
y_values <- dt(temp,30)
plot(temp,y_values,type="l",yaxs="i",ylim=c(0,0.42),ylab=expression(f[t[30]](v)),xlab="v",xaxt="n",yaxt="n",main=expression(paste(t[30]," distribution")))
axis(1,labels=FALSE,xaxt="n")
mtext("0",side=1,at=0,cex=0.6)
mtext(expression(-t[30*","*0.025]),side=1,at=qt(0.025,30)-0.5,cex=0.6)
mtext(expression(t[30*","*0.025]),side=1,at=qt(0.975,30)+0.5,cex=0.6)
xshade <- seq(-4,qt(0.025,30),length=100)
yshade <- dt(xshade,30)
polygon(c(-4,xshade,qt(0.025,30)),c(0,yshade,0),col="blue")
xshade <- seq(qt(0.975,30),4,length=100)
yshade <- dt(xshade,30)
polygon(c(qt(0.975,30),xshade,4),c(0,yshade,0),col="blue")
lines(c(0,0),c(0,0.42),lty=3)
# t with 5 df: shade the two 5% tails beyond the critical values
temp <- seq(-4,4,by=0.01)
y_values <- dt(temp,5)
plot(temp,y_values,type="l",yaxs="i",ylim=c(0,0.42),ylab=expression(f[t[5]](v)),xlab="v",xaxt="n",yaxt="n",main=expression(paste(t[5]," distribution")))
axis(1,labels=FALSE,xaxt="n")
mtext("0",side=1,at=0,cex=0.6)
mtext(expression(-t[5*","*0.05]),side=1,at=qt(0.05,5)-0.15,cex=0.6)
mtext(expression(t[5*","*0.05]),side=1,at=qt(0.95,5)+0.15,cex=0.6)
xshade <- seq(-4,qt(0.05,5),length=100)
yshade <- dt(xshade,5)
polygon(c(-4,xshade,qt(0.05,5)),c(0,yshade,0),col="blue")
xshade <- seq(qt(0.95,5),4,length=100)
yshade <- dt(xshade,5)
polygon(c(qt(0.95,5),xshade,4),c(0,yshade,0),col="blue")
lines(c(0,0),c(0,0.42),lty=3)
# t with 30 df: shade the two 5% tails beyond the critical values
# (fixed: the tail shading previously used qt(...,5) quantiles on this
# df=30 plot; it now uses the df=30 critical values, matching the labels)
temp <- seq(-4,4,by=0.01)
y_values <- dt(temp,30)
plot(temp,y_values,type="l",yaxs="i",ylim=c(0,0.42),ylab=expression(f[t[30]](v)),xlab="v",xaxt="n",yaxt="n",main=expression(paste(t[30]," distribution")))
axis(1,labels=FALSE,xaxt="n")
mtext("0",side=1,at=0,cex=0.6)
mtext(expression(-t[30*","*0.05]),side=1,at=qt(0.05,30)-0.5,cex=0.6)
mtext(expression(t[30*","*0.05]),side=1,at=qt(0.95,30)+0.5,cex=0.6)
xshade <- seq(-4,qt(0.05,30),length=100)
yshade <- dt(xshade,30)
polygon(c(-4,xshade,qt(0.05,30)),c(0,yshade,0),col="blue")
xshade <- seq(qt(0.95,30),4,length=100)
yshade <- dt(xshade,30)
polygon(c(qt(0.95,30),xshade,4),c(0,yshade,0),col="blue")
lines(c(0,0),c(0,0.42),lty=3)
# 90% one-sided t critical value with 19 df
qt(0.90,19)
# coverage simulation: r t-based 95% CIs for the mean of N(10, 2^2), n=8
set.seed(1234)
mu_x <- 10
sigma_x <- 2
nobs <- 8
r <- 100
CIlower <- numeric(r)
CIupper <- numeric(r)
mu_notinCI <- logical(r)
for (j in 1:r) {
sample <- rnorm(nobs,mean=mu_x,sd=sigma_x)
CIlower[j] <- mean(sample)-qt(0.975,nobs-1)*(sd(sample)/sqrt(nobs))
CIupper[j] <- mean(sample)+qt(0.975,nobs-1)*(sd(sample)/sqrt(nobs))
mu_notinCI[j] <- ((mu_x<CIlower[j]) | (mu_x>CIupper[j]))
}
# draw each CI as a horizontal segment; intervals that miss mu are black
color <- rep(gray(0.7),r)
color[which(mu_notinCI[1:r])] <- "black"
plot(0, xlim=c(mu_x-6*sigma_x/sqrt(nobs),mu_x+6*sigma_x/sqrt(nobs)),ylim=c(1,r),ylab="Simulation number",xlab="",main="")
abline(v=mu_x,lty=2)
for (j in 1:r) {
lines(c(CIlower[j],CIupper[j]),c(j,j),col=color[j],lwd=2)
}
# Standard error of the sample mean of a vector: sd(x) / sqrt(n).
# Set na.rm = TRUE to drop missing values before computing.
se_meanx <- function (x, na.rm = FALSE) {
  vals <- if (na.rm) x[!is.na(x)] else x
  sd(vals) / sqrt(length(vals))
}
par(mfrow=c(1,1))
# point estimates with interval bars for three proportions
plot(c(0.20,0.22,0.15),c(3,2,1),xlim=c(0,0.4),ylim=c(0,4),pch=16,yaxt="n",ann=FALSE,cex=1.2)
lines(c(0.155,0.245),c(3,3),lwd=2)
text(0.30,3,expression("e-mail A ("*pi[A]*")"),cex=0.8)
lines(c(0.173,0.267),c(2,2),lwd=2)
text(0.32,2,expression("e-mail B ("*pi[B]*")"),cex=0.8)
lines(c(0.136,0.164),c(1,1),lwd=2)
text(0.22,1,expression("no e-mail ("*pi[C]*")"),cex=0.8)
# (cps data frame is assumed to be loaded elsewhere)
# create union indicator variable from unionstatus factor variable
cps$union <- 1*(cps$unionstatus=="Union")
# loop through variables of interest
for (varname in c("age","educ","ownchild","earnwk","union")) {
# sample size is number of non-missing observations
nobs <- sum(!is.na(cps[,varname]))
# calculate sample mean of variable
mean_var <- mean(cps[,varname], na.rm=TRUE)
# calculate standard error of sample mean
se_mean <- se_meanx(cps[,varname], na.rm=TRUE)
# output results
print(paste(varname,":","n",nobs,"Mean",signif(mean_var,digits=5),"SE(mean)",signif(se_mean,digits=5)))
}
# Standard error of the sample standard deviation of a vector, using the
# large-sample formula sqrt(m4 - s^4) / (s * sqrt(n)), where m4 is the
# fourth central moment and s the sample stdev. Set na.rm = TRUE to drop
# missing values before computing.
se_sx <- function (x, na.rm = FALSE) {
  vals <- if (na.rm) x[!is.na(x)] else x
  n <- length(vals)
  s <- sd(vals)
  m4 <- mean((vals - mean(vals))^4)
  sqrt(m4 - s^4) / (s * sqrt(n))
}
# (sp500 data frame is assumed to be loaded elsewhere)
# determine the sample size
nobs <- nrow(sp500)
# loop through the stocks of interest
for (stock in c("HD","LOW","BAC","WFC","MRO","COP")) {
returns <- sp500[,stock]
# calculate the stdev of returns
sx <- sd(returns)
# calculate the se of the stdev of returns
stderror_sx <- se_sx(returns)
# output the stock name, along with stdev and se(stdev)
print(paste(stock,":","sx",round(sx,4),"se_sx",round(stderror_sx,5)))
}
# Standard error of the sample correlation between x and y, using the
# large-sample approximation (1 - r^2) / sqrt(n). With na.rm = TRUE,
# only observations where both x and y are non-missing are used.
se_rxy <- function (x, y, na.rm = FALSE) {
  if (na.rm) {
    keep <- !is.na(x) & !is.na(y)
    x <- x[keep]
    y <- y[keep]
  }
  (1 - cor(x, y)^2) / sqrt(length(x))
}
# SE of the educ/earnwk sample correlation in the CPS data
se_rxy(cps$educ, cps$earnwk, na.rm = TRUE)
# determine the sample size
nobs <- nrow(sp500)
# (note: nobs is not used below; se_rxy computes its own sample size)
# initialize variables with stock names and number of stocks
stocks <- c("HD","LOW","BAC","WFC","MRO","COP")
num_stocks <- length(stocks)
# double loop to loop through possible pairs of stocks (i < j, so each
# unordered pair is visited exactly once)
for (i in 1:(num_stocks-1)) {
for (j in (i+1):num_stocks) {
# calculate sample correlation
rxy <- cor(sp500[,stocks[i]],sp500[,stocks[j]])
# calculate standard error of sample correlation
stderr_rxy <- se_rxy(sp500[,stocks[i]],sp500[,stocks[j]])
# output stock-pair names, along with corr and se(corr)
print(paste("Pair",stocks[i],stocks[j],": rxy",round(rxy,3),"se(rxy)",round(stderr_rxy,3)))
}
}
# paired-difference statistics for exam scores (same students, two exams)
# sample mean of exam-score difference
mean(exams$exam1-exams$exam2)
# sample stdev of exam-score difference
sd(exams$exam1-exams$exam2)
# std error of sample mean of exam-score difference
# (se_meanx is a helper defined earlier in the file)
se_meanx(exams$exam1-exams$exam2)
# calculate means based upon table of counts
# (cell counts: 110 neither, 30 B only, 44 A only, 16 both; n = 200)
meanA <- (44+16)/200
meanB <- (30+16)/200
meandiff <- meanA-meanB
# calculate stdev of linkA-linkB using sample proportion method
# (the 200/199 factor converts from divisor n to divisor n-1)
sd_1 <- sqrt( (200/199) * ((110/200)*((0-0)-meandiff)^2 + (30/200)*((0-1)-meandiff)^2
+ (44/200)*((1-0)-meandiff)^2 + (16/200)*((1-1)-meandiff)^2) )
print(paste("SD of linkA-linkB from sample proportion method:",round(sd_1,5)))
# create a data frame based upon the table of counts
links <- data.frame(linkA = c(rep(0,110),rep(0,30),rep(1,44),rep(1,16)),
linkB = c(rep(0,110),rep(1,30),rep(0,44),rep(1,16)))
# calculate stdev of linkA-linkB using the sample; should match sd_1
sd_2 <- sd(links$linkA - links$linkB)
print(paste("SD of linkA-linkB from sample stdev method:",round(sd_2,5)))
# two-sample comparisons of union vs non-union workers; for independent
# samples the SE of a difference is sqrt(se1^2 + se2^2)
# calculate std errors of the sample averages of union and non-union wages
se_union <- se_meanx(cps[cps$unionstatus=="Union","earnwk"], na.rm=TRUE)
se_nonunion <- se_meanx(cps[cps$unionstatus=="Non-union","earnwk"], na.rm=TRUE)
se_union
se_nonunion
# calculate std error of the difference in sample averages
sqrt(se_union^2 + se_nonunion^2)
# calculate std errors of the stdevs of earnings
se_sx_union <- se_sx(cps[cps$unionstatus=="Union","earnwk"], na.rm=TRUE)
se_sx_nonunion <- se_sx(cps[cps$unionstatus=="Non-union","earnwk"], na.rm=TRUE)
se_sx_union
se_sx_nonunion
# calculate std error of difference in stdevs of earnings
sqrt(se_sx_union^2 + se_sx_nonunion^2)
# educ/earnwk correlation within each group (complete cases only)
cor(cps[cps$unionstatus=="Union","earnwk"],cps[cps$unionstatus=="Union","educ"], use="complete.obs")
cor(cps[cps$unionstatus=="Non-union","earnwk"],cps[cps$unionstatus=="Non-union","educ"], use="complete.obs")
# std errors of the within-group correlations, and SE of their difference
se_rxy_union <- se_rxy(cps[cps$unionstatus=="Union","earnwk"],cps[cps$unionstatus=="Union","educ"], na.rm=TRUE)
se_rxy_nonunion <- se_rxy(cps[cps$unionstatus=="Non-union","earnwk"],cps[cps$unionstatus=="Non-union","educ"], na.rm=TRUE)
se_rxy_union
se_rxy_nonunion
sqrt(se_rxy_union^2 + se_rxy_nonunion^2)
# construct and summarize a single bootstrap sample from a small data frame
set.seed(1234)
# create a data frame with the original data
df <- data.frame(x = c(4,3,8,12,0,10,5), y = c(8,6,10,1,15,3,6))
# determine the number of observations
nobs <- nrow(df)
# create a vector of index values by sampling with replacement
bs_index <- sample(1:nobs, nobs, replace = TRUE)
bs_index
# create a data frame corresponding to the bootstrap sample
# (rows are resampled jointly so the x-y pairing is preserved)
bs_df <- df[bs_index,]
bs_df
print(paste("Means for bootstrap sample: x", round(mean(bs_df$x),2), ", y", round(mean(bs_df$y),2)))
print(paste("Medians for bootstrap sample: x", median(bs_df$x), ", y", median(bs_df$y)))
print(paste("Stdevs for bootstrap sample: x", round(sd(bs_df$x),2), ", y", round(sd(bs_df$y),2)))
print(paste("Correlation for bootstrap sample:", round(cor(bs_df$x,bs_df$y),3)))
# full bootstrap: B resamples, storing four statistics per replication,
# then histograms of the bootstrap distributions with the original-sample
# statistic marked by a red vertical line
set.seed(1234)
# create a data frame with the original data
df <- data.frame(x = c(4,3,8,12,0,10,5), y = c(8,6,10,1,15,3,6))
# determine the number of observations
nobs <- nrow(df)
# initialize the number of bootstrap replications
B <- 1000
# preallocate vectors to hold bootstrap statistics
bs_meanx <- rep(0,B)
bs_meany <- rep(0,B)
bs_sx <- rep(0,B)
bs_rxy <- rep(0,B)
# loop B times to create bootstrap samples and calculate bootstrap statistics
for (i in 1:B) {
# create a vector of index values by sampling with replacement
bs_index <- sample(1:nobs, nobs, replace = TRUE)
# create a data frame corresponding to the bootstrap sample
bs_df <- df[bs_index,]
# calculate the bootstrap statistics
bs_meanx[i] <- mean(bs_df$x)
bs_meany[i] <- mean(bs_df$y)
bs_sx[i] <- sd(bs_df$x)
bs_rxy[i] <- cor(bs_df$x,bs_df$y)
}
# graph-display format (two rows, two columns)
par(mfrow=c(2,2))
hist(bs_meanx, freq=FALSE, xlab="", ylab="", main=expression(paste("Sample mean of ",x)),
xlim=c(2,10), yaxt='n', breaks=20)
lines(density(bs_meanx), col="blue", lwd=2)
# red line marks the statistic computed from the original sample
abline(v=mean(df$x), col="red", lwd=3)
hist(bs_meany, freq=FALSE, xlab="", ylab="", main=expression(paste("Sample mean of ",y)),
xlim=c(2,12), yaxt='n', breaks=20)
lines(density(bs_meany), col="blue", lwd=2)
abline(v=mean(df$y), col="red", lwd=3)
hist(bs_sx, freq=FALSE, xlab="", ylab="", main=expression(paste("Sample stdev of ",x)),
xlim=c(1,6), yaxt='n', breaks=40)
lines(density(bs_sx), col="blue", lwd=2)
abline(v=sd(df$x), col="red", lwd=3)
hist(bs_rxy, freq=FALSE, xlab="", ylab="",
main=expression(paste("Sample correlation of ",x," and ",y)),
xlim=c(-1,0), yaxt='n', breaks=40)
lines(density(bs_rxy), col="blue", lwd=2)
abline(v=cor(df$x,df$y), col="red", lwd=3)
# bootstrap SEs for the mean and median of a simulated lognormal sample,
# compared with the asymptotic SE of the sample mean
set.seed(1234)
nobs <- 100
x <- rlnorm(nobs)
mean(x)
median(x)
# asymptotic standard error of sample mean
sd(x)/sqrt(nobs)
# use bootstrap sampling to calculate bootstrap standard errors
B <- 10000
bs_meanx <- rep(0,B)
bs_medianx <- rep(0,B)
for (i in 1:B) {
bs_index <- sample(1:nobs, nobs, replace = TRUE)
bs_x <- x[bs_index]
bs_meanx[i] <- mean(bs_x)
bs_medianx[i] <- median(bs_x)
}
# bootstrap SE = stdev of the statistic across bootstrap replications;
# the difference mean-median is computed within each replication
print(paste("Bootstrap standard error of the sample mean:", round(sd(bs_meanx),4)))
print(paste("Bootstrap standard error of the sample median:", round(sd(bs_medianx),4)))
print(paste("Bootstrap standard error of the difference:", round(sd(bs_meanx-bs_medianx),4)))
# bootstrap SE for a difference of two correlations sharing a variable (HD)
set.seed(1234)
# calculate difference r(HD,LOW)-r(HD,BAC)
cor(sp500$HD,sp500$LOW)-cor(sp500$HD,sp500$BAC)
# initialize variables
nobs <- nrow(sp500)
B <- 1000
bs_rdiff <- rep(0,B)
# loop to create bootstrap samples and calculate the bootstrap statistic
# (whole rows are resampled, preserving the dependence between stocks)
for (i in 1:B) {
bs_index <- sample(1:nobs, nobs, replace = TRUE)
bs_df <- sp500[bs_index,]
bs_rdiff[i] <- cor(bs_df$HD,bs_df$LOW)-cor(bs_df$HD,bs_df$BAC)
}
# output the bootstrap standard error for r(HD,LOW)-r(HD,BAC)
sd(bs_rdiff)
# bootstrap SEs and normal-based 95% CIs for several earnings quantiles
set.seed(1234)
# construct a vector with the quantiles of interest (deciles and quartiles)
quantiles <- sort(c(seq(0.1,0.9,0.1),0.25,0.75))
# initialize variables
nobs <- nrow(cpsemployed)
B <- 5000
# loop over the quantiles of interest
for (q in quantiles) {
bs_quantile <- rep(0,B)
# loop to create bootstrap samples and calculate the bootstrap statistics
for (i in 1:B) {
bs_index <- sample(1:nobs, nobs, replace = TRUE)
bs_quantile[i] <- quantile(cpsemployed[bs_index,"earnwk"],q)
}
# output the bootstrap se and normal-based bootstrap CI for this quantile
# (CI is the point estimate +/- 1.96 bootstrap SEs)
quantile_estimate <- quantile(cpsemployed$earnwk,q)
print(paste("Quantile ", q, ": sample quantile ", round(quantile_estimate),
", bootstrap se ", round(sd(bs_quantile),1),
", 95% CI (", round(quantile_estimate-1.96*sd(bs_quantile),1),
",", round(quantile_estimate+1.96*sd(bs_quantile),1), ")", sep=""))
}
# bootstrap SE and normal-based 95% CI for the interquartile range of earnings
set.seed(1234)
# initialize variables
nobs <- nrow(cpsemployed)
B <- 5000
bs_iqr <- rep(0,B)
# loop to create bootstrap samples and calculate the bootstrap statistics
for (i in 1:B) {
bs_index <- sample(1:nobs, nobs, replace = TRUE)
bs_iqr[i] <- IQR(cpsemployed[bs_index,"earnwk"])
}
# output the bootstrap standard error and normal-based CI for the IQR
iqr_earnwk <- IQR(cpsemployed$earnwk)
print(paste("IQR: ", round(iqr_earnwk,1),
", bootstrap se ", round(sd(bs_iqr),1),
", 95% CI (", round(iqr_earnwk-1.96*sd(bs_iqr),1),
",", round(iqr_earnwk+1.96*sd(bs_iqr),1), ")", sep=""))
# percentile-method bootstrap intervals: quantiles of the bootstrap
# distributions computed in the earlier chunks (bs_meanx/bs_medianx from
# the lognormal bootstrap; bs_rdiff and bs_iqr from their loops above)
# output the 2.5% and 97.5% quantiles of the bootstrap distributions
quantile(bs_meanx, c(0.025,0.975))
quantile(bs_medianx, c(0.025,0.975))
quantile(bs_meanx-bs_medianx, c(0.025,0.975))
# output the 5% and 95% quantiles of the bootstrap distributions
quantile(bs_meanx, c(0.05,0.95))
quantile(bs_medianx, c(0.05,0.95))
quantile(bs_meanx-bs_medianx, c(0.05,0.95))
# output the 2.5% and 97.5% quantiles of the bootstrap distribution
quantile(bs_rdiff, c(0.025,0.975))
# output the 2.5% and 97.5% quantiles of the bootstrap distribution
quantile(bs_iqr, c(0.025,0.975))
# chi-squared critical values (95% and 90%, 2 degrees of freedom)
qchisq(0.95,2)
qchisq(0.90,2)
# density of the chi-squared(6) distribution with the upper 5% tail shaded
par(mfrow=c(1,1))
x_chisq <- seq(0,20,by=0.01)
y_chisq <- dchisq(x_chisq,6)
plot(x_chisq,y_chisq,type="l",yaxs="i",ylim=c(0,0.15),ylab="",xlab="",xaxt="n",yaxt="n",main="")
axis(1,labels=FALSE,xaxt="n")
# mark the 95th-percentile critical value on the x-axis
mtext(expression(w[paste(Q,",",0.05)]),side=1,at=qchisq(0.95,6),cex=0.7)
xshade <- seq(qchisq(0.95,6),20,length=100)
yshade <- dchisq(xshade,6)
polygon(c(qchisq(0.95,6),xshade,20),c(0,yshade,0),col="blue")
# same plot with the upper 10% tail shaded, labeled at a Wald statistic
par(mfrow=c(1,1))
x_chisq <- seq(0,20,by=0.01)
y_chisq <- dchisq(x_chisq,6)
plot(x_chisq,y_chisq,type="l",yaxs="i",ylim=c(0,0.15),ylab="",xlab="",xaxt="n",yaxt="n",main="")
axis(1,labels=FALSE,xaxt="n")
mtext("Wald stat",side=1,at=qchisq(0.90,6),cex=0.7)
xshade <- seq(qchisq(0.90,6),20,length=100)
yshade <- dchisq(xshade,6)
polygon(c(qchisq(0.90,6),xshade,20),c(0,yshade,0),col="blue")
# p-values for two example Wald statistics (2 and 3 restrictions)
1-pchisq(11.17,2)
1-pchisq(81.59,3)
# Wald test of the linear hypotheses R %*% gamma = r.
# gamma_hat:     L x 1 vector of parameter estimates
# var_gamma_hat: L x L variance-covariance matrix of the estimates
# R: Q x L matrix of linear restrictions to test
#    (default: identity matrix, i.e. test all parameters equal to zero)
# r: Q x 1 vector of hypothesized values (default: vector of zeros)
# Returns a list with the statistic W and its chi-squared(Q) p-value.
wald_test <- function(gamma_hat, var_gamma_hat, R=diag(length(gamma_hat)), r=rep(0,length(gamma_hat))) {
  # a single restriction may arrive as a plain vector; coerce to a 1 x L matrix
  if (!is.matrix(R)) {
    R <- t(as.matrix(R))
  }
  # Q x 1 deviation of the estimated restrictions from their hypothesized values
  deviation <- R %*% gamma_hat - r
  # variance-covariance matrix of the restricted estimates
  vcov_restr <- R %*% var_gamma_hat %*% t(R)
  # quadratic form: W = d' V^{-1} d
  wstat <- as.numeric(t(deviation) %*% solve(vcov_restr) %*% deviation)
  # under H0, W ~ chi-squared with Q = nrow(R) degrees of freedom
  list(W = wstat, p_value = 1 - pchisq(wstat, nrow(R)))
}
# Variance-covariance matrix of sample proportions from independent samples.
# pi_hat: vector of sample proportions
# nobs:   vector of sample sizes (same length as pi_hat, or recycled)
# Returns a diagonal matrix with entries pi*(1-pi)/n; off-diagonal entries
# are zero because the samples are assumed independent.
var_prop_indep <- function(pi_hat, nobs) {
  # nrow is given explicitly: diag(x) on a length-1 numeric would otherwise
  # be interpreted as a matrix *dimension* (diag(0.25) -> 0 x 0 matrix),
  # so a single proportion would silently produce an empty matrix
  var_pi_hat <- diag(pi_hat*(1-pi_hat)/nobs, nrow = length(pi_hat))
  return(var_pi_hat)
}
# Variance-covariance matrix of sample means from independent samples.
# x_vectors: list of numeric sample vectors, one per mean
# Returns a diagonal matrix whose i-th entry is var(x_i)/n_i; off-diagonal
# entries are zero because the samples are assumed independent.
var_mean_indep <- function(x_vectors) {
  num_means <- length(x_vectors)
  tempvec <- rep(0, num_means)
  # variance of each sample mean: var(x)/n
  # (seq_len handles the empty-list case; 1:0 would iterate twice)
  for (i in seq_len(num_means)) {
    tempvec[i] <- var(x_vectors[[i]])/length(x_vectors[[i]])
  }
  # nrow is given explicitly so a single sample still yields a 1 x 1 matrix;
  # diag() on a length-1 numeric would be read as a matrix dimension
  var_mean <- diag(tempvec, nrow = num_means)
  return(var_mean)
}
# Variance-covariance matrix of the sample means of several variables
# measured on the *same* sample (so the means are generally correlated).
# df:   data frame containing the sample
# vars: vector of variable names or column indices
# Returns cov(X)/n, the estimated var-cov matrix of the vector of means.
var_mean_onesample <- function(df, vars) {
  # drop = FALSE keeps a single selected column as a data frame, so cov()
  # still returns a 1 x 1 matrix instead of erroring on a bare vector
  var_mean <- cov(df[, vars, drop = FALSE])/nrow(df)
  return(var_mean)
}
# simple linear regressions and scatterplots with fitted lines
options(digits=5)
cps$union <- 1*(cps$unionstatus=="Union")
# SLR with the 0/1 indicator vs. the factor variable (same fit, different labels)
lm(earnwk~union, data=cps)
lm(earnwk~unionstatus, data=cps)
cigdata <- read.csv("cigdata2019.csv")
# rename variables for convenience
cigdata$cigsales <- cigdata$pack_sales_percapita
cigdata$cigtax <- cigdata$statetax_pack
lm(cigsales~cigtax, data=cigdata)
cigdata2019 <- read.csv("cigdata2019.csv",na.strings="",stringsAsFactors=TRUE)
par(mfrow = c(1,1))
plot(cigdata2019$statetax_pack,cigdata2019$pack_sales_percapita,cex=0.5,xlab="State cigarette tax (per pack)",ylab="2019 cigarette sales (packs per capita)",xaxt="n",yaxt="n")
# overlay the least-squares fitted line
abline(lm(cigdata2019$pack_sales_percapita~cigdata2019$statetax_pack),col="blue",lwd=2)
axis(1, at=seq(0,5,by=1), las=1)
axis(2, at=seq(0,80,by=20), las=1)
# NOTE(review): sp500 is used here before being (re)read on the next line --
# presumably it was already loaded earlier in the file; verify ordering
lm(HD~IDX, data=sp500)
sp500 <- read.csv("sp500-monthly-returns.csv",na.strings="",stringsAsFactors=TRUE)
par(mfrow = c(1,1))
plot(sp500$IDX,sp500$HD,cex=0.5,xlab="S&P 500 monthly return",ylab="Home Depot monthly return",xaxt="n",yaxt="n")
abline(lm(sp500$HD~sp500$IDX),col="blue",lwd=2)
axis(1, at=seq(-1,1,by=0.05), las=1)
axis(2, at=seq(-1,1,by=0.1), las=1)
# dotted reference lines at zero index return and the sample mean HD return
abline(v=0,lty=3)
abline(h=0.008731551,lty=3)
# store results of least-squares estimation
results <- lm(cigsales~cigtax, data=cigdata)
# access estimated residuals and fitted values
uhat <- results$residuals
yhat <- results$fitted.values
# output first seven estimated residuals and fitted values
uhat[1:7]
yhat[1:7]
# algebraic properties of least squares: residuals average to zero,
# fitted values average to the outcome's mean
mean(uhat)
mean(cigdata$cigsales)
mean(yhat)
# residuals are uncorrelated with the regressor and the fitted values
cor(uhat,cigdata$cigtax)
cor(uhat,yhat)
# residual standard deviation, computed manually with n-2 degrees of freedom
sighatsq_u <- sum(uhat^2)/(nrow(cigdata)-2)
sighat_u <- sqrt(sighatsq_u)
sighat_u
# matches the value reported by summary()
summary(results)$sigma
summary(results)
# R-squared computed four equivalent ways
y <- cigdata$cigsales
x <- cigdata$cigtax
var(yhat)/var(y)
1-var(uhat)/var(y)
cor(y,yhat)^2
cor(y,x)^2
summary(results)$r.squared
# loop through the six stocks of interest
for (varname in c("HD","LOW","BAC","WFC","MRO","COP")) {
# run the SLR regression of company returns vs index (IDX) returns
results <- lm(sp500[,varname]~sp500$IDX)
# access statistics from the regression results
rsq <- summary(results)$r.squared
sighat_u <- summary(results)$sigma
# output R-squared and residual stdev for each stock's market model
print(paste(varname,"versus index:", "R-sq", round(rsq,3), "sighat_u", round(sighat_u,4)))
}
# simulate a homoskedastic and a heteroskedastic sample from y = 1 + 8x + u
set.seed(1234)
x <- runif(500)
y <- 1+8*x+rnorm(500)
# error stdev grows with x, producing heteroskedasticity
y2 <- 1+8*x+rnorm(500,sd=2*x)
par(mfrow = c(2,1))
plot(x,y,ylab="y",xlab="x",cex=0.5,main="Sample 1 (homoskedastic)")
# true regression line
abline(1,8,col="blue",lwd=2)
plot(x,y2,ylab="y",xlab="x",cex=0.5,main="Sample 2 (heteroskedastic)")
abline(1,8,col="blue",lwd=2)
# set the CRAN mirror and install the estimatr package
# NOTE(review): mirror uses http, not https, and install.packages() runs on
# every execution of the script -- consider installing interactively instead
r = getOption("repos")
r["CRAN"] = "http://cran.us.r-project.org"
options(repos = r)
install.packages("estimatr")
library(estimatr)
# heteroskedasticity-robust regressions with estimatr::lm_robust
options(scipen=0)
results <- lm_robust(cigsales~cigtax, data=cigdata)
results
summary(results)
results <- lm_robust(HD~IDX, data=sp500)
results
# output results with 95% confidence intervals (default alpha=0.05)
lm_robust(cigsales~cigtax, data=cigdata)
# output results with 90% confidence intervals (alpha=0.10)
lm_robust(cigsales~cigtax, data=cigdata, alpha=0.10)
results <- lm_robust(cigsales~cigtax, data=cigdata)
results
results <- lm_robust(HD~IDX, data=sp500)
results
results <- lm_robust(earnwk~union, data=cps)
results
# multiple linear regressions (non-robust, via lm)
lm(HD~IDX+LOW, data=sp500)
lm(cigsales~cigtax+producer, data=cigdata)
# define the female indicator variable
cps$female <- 1*(cps$gender=="Female")
# define the (potential) experience variable: age minus schooling minus 6
cps$exper <- cps$age-cps$educ-6
# least-squares estimation of the MLR model
lm(earnwk~educ+exper+union+female, data=cps)
# store results of least-squares estimation
results <- lm(earnwk~educ+exper+union+female, data=cps)
# access estimated residuals and fitted values
uhat <- results$residuals
yhat <- results$fitted.values
# output first ten estimated residuals and fitted values, rounded to the nearest dollar
round(uhat[1:10])
round(yhat[1:10])
# compare MLR fit against each SLR fit via R-squared and residual stdev
# store MLR estimation results (HD on IDX and LOW)
results_mlr <- lm(HD~IDX+LOW, data=sp500)
# store SLR estimation results (HD on IDX)
results_slr1 <- lm(HD~IDX, data=sp500)
# store SLR estimation results (HD on LOW)
results_slr2 <- lm(HD~LOW, data=sp500)
# output the R-squared values and estimated residual standard deviations
print(paste("MLR results (IDX, LOW)", "R-sq", round(summary(results_mlr)$r.squared,3),
"sighat_u", round(summary(results_mlr)$sigma,4)))
print(paste("SLR results (IDX)", "R-sq", round(summary(results_slr1)$r.squared,3),
"sighat_u", round(summary(results_slr1)$sigma,4)))
print(paste("SLR results (LOW)", "R-sq", round(summary(results_slr2)$r.squared,3),
"sighat_u", round(summary(results_slr2)$sigma,4)))
# heteroskedasticity-robust versions of the MLR models above
results <- lm_robust(cigsales~cigtax+producer, data=cigdata)
results
# MLR model of HD on IDX and LOW
mlr_results <- lm_robust(HD~IDX+LOW, data=sp500)
mlr_results
# SLR model of HD on IDX
slr_results1 <- lm_robust(HD~IDX, data=sp500)
slr_results1
# SLR model of HD on LOW
slr_results2 <- lm_robust(HD~LOW, data=sp500)
slr_results2
# wage regression with robust SEs, at the default and 90% confidence levels
results <- lm_robust(earnwk~educ+exper+union+female, data=cps)
results
results <- lm_robust(earnwk~educ+exper+union+female, data=cps, alpha=0.10)
results
mlr_results <- lm_robust(HD~IDX+LOW, data=sp500)
mlr_results
mlr_results <- lm_robust(cigsales~cigtax+producer, data=cigdata)
mlr_results
# Estimate a single linear combination R'beta of regression coefficients,
# with its standard error and a normal-based two-sided p-value.
# regresults: fitted model object exposing $coefficients and $vcov
#             (e.g. the return value of estimatr::lm_robust)
# R: numeric vector of weights, one per coefficient
# Returns a list with components estimate, se, and pvalue.
linear_combination <- function(regresults, R) {
  # coerce the weight vector to a 1 x L row matrix
  weights <- t(as.matrix(R))
  # point estimate of the linear combination
  estimate <- as.numeric(weights %*% regresults$coefficients)
  # delta-method standard error from the coefficient var-cov matrix
  se <- as.numeric(sqrt(weights %*% regresults$vcov %*% t(weights)))
  # two-sided p-value based on the standard normal distribution
  pvalue <- 2*(1-pnorm(abs(estimate/se)))
  list(estimate = estimate, se = se, pvalue = pvalue)
}
# test whether the IDX and LOW slopes are equal (weights 0, 1, -1)
results <- lm_robust(HD~IDX+LOW, data=sp500)
linear_combination(results, c(0,1,-1))
# larger model adding BAC and WFC as regressors
results <- lm_robust(HD~IDX+LOW+BAC+WFC, data=sp500)
results
# Wald test of the linear restrictions R %*% beta = r on a fitted model.
# regresults: fitted model object exposing $coefficients and $vcov
# R: Q x L restriction matrix (or vector for a single restriction)
# r: Q x 1 vector of hypothesized values
# Delegates to wald_test(), returning its list(W, p_value).
test_linear_restrictions <- function(regresults, R, r) {
  wald_test(regresults$coefficients, regresults$vcov, R, r)
}
results <- lm_robust(HD~IDX+LOW+BAC+WFC, data=sp500)
# use rbind function to create a matrix consisting of the two restrictions:
# first restriction is beta3=0, second restriction is beta4=0
# (columns correspond to intercept, IDX, LOW, BAC, WFC)
R <- rbind(c(0,0,0,1,0), c(0,0,0,0,1))
# create a vector of zeros for the two restrictions
r <- c(0,0)
# conduct the Wald test of the joint hypothesis BAC = WFC = 0
test_linear_restrictions(results, R, r)
# create marital-status indicator variables (never married is the omitted category)
cps$married <- 1*(cps$marstatus=="Married")
cps$divorced <- 1*(cps$marstatus=="Divorced")
cps$widowed <- 1*(cps$marstatus=="Widowed")
cps$nevermarried <- 1*(cps$marstatus=="Never married")
# estimate Model I (marital-status indicators only)
model1 <- lm_robust(earnwk~married+divorced+widowed, data=cps)
model1
# estimate Model II (adding education, experience, union, gender controls)
model2 <- lm_robust(earnwk~married+divorced+widowed+educ+exper+union+female, data=cps)
model2
# quadratic in education; I() protects the arithmetic inside the formula
results <- lm_robust(earnwk~educ+I(educ^2)+exper+union+female, data=cps)
results
summary(results)$r.squared
# marginal effect of one more year of education at several education levels
# (derivative of b1*educ + b2*educ^2 evaluated as a discrete +1 change)
educ_vec <- c(10,12,14,16)
for (educ_star in educ_vec) {
print(linear_combination(results, c(0,1,2*educ_star+1,0,0,0)))
}
options(digits=4)
# model with an education-experience interaction term
results <- lm_robust(earnwk~educ+exper+I(educ*exper)+union+female, data=cps)
results
summary(results)$r.squared
options(digits=5)
# marginal effect of education at different experience levels
exper_vec <- c(10,20)
for (exper_star in exper_vec) {
print(linear_combination(results, c(0,1,0,exper_star,0,0)))
}
# marginal effect of experience at different education levels
educ_vec <- c(12,14)
for (educ_star in educ_vec) {
print(linear_combination(results, c(0,0,1,educ_star,0,0)))
}
options(digits=4)
# create the post-2005 indicator variable
# (first 180 monthly observations are pre-2005, the remaining 184 post-2005)
sp500$post2005 <- c(rep(0,180),rep(1,184))
# least-squares estimation for models with post-2005 interaction variable,
# allowing both intercept and market beta to shift after 2005
results_HD <- lm_robust(HD~post2005+IDX+I(post2005*IDX), data=sp500)
results_HD
results_LOW <- lm_robust(LOW~post2005+IDX+I(post2005*IDX), data=sp500)
results_LOW
results_BAC <- lm_robust(BAC~post2005+IDX+I(post2005*IDX), data=sp500)
results_BAC
results_COP <- lm_robust(COP~post2005+IDX+I(post2005*IDX), data=sp500)
results_COP
# partial effect (with standard error) of IDX for post-2005 period in BAC model
# (sum of the IDX slope and the interaction slope)
linear_combination(results_BAC,c(0,0,1,1))
options(digits=5)
options(digits=4)
# log-wage regression (semi-elasticity interpretation of slopes)
logwage_results <- lm_robust(I(log(earnwk))~educ+exper+union+female, data=cps)
logwage_results
logwage_results$r.squared
options(digits=5)
options(digits=4)
# log-log specification in experience (elasticity interpretation)
logwage_results <- lm_robust(I(log(earnwk))~educ+I(log(exper))+union+female, data=cps)
logwage_results
logwage_results$r.squared
options(digits=5)
options(digits=4)
# log-log cigarette demand model: slope is the tax elasticity of sales
lm_robust(I(log(cigsales))~I(log(cigtax)), data=cigdata)
options(digits=5)
# input the births2021.csv dataset
births2021 <- read.csv("births2021.csv")
# create the squared residuals from the birthweight regression
lmresults <- lm(bweight~age+hsgrad+somecoll+collgrad+married+smoke+male, data=births2021)
births2021$uhatsq <- (lmresults$residuals)^2
# MLR model with birthweight as the outcome (robust SEs)
lm_robust(bweight~age+hsgrad+somecoll+collgrad+married+smoke+male, data=births2021)
# Var(U|X) model with squared residuals as the outcome
# (regressors that predict uhat^2 indicate heteroskedasticity)
lm_robust(uhatsq~age+hsgrad+somecoll+collgrad+married+smoke+male, data=births2021)
# input the widgets.csv dataset
widgets <- read.csv("widgets.csv")
# estimate the LPM (linear probability model) for purchase
lpm_results <- lm_robust(purchase~emailA+emailB, data=widgets)
lpm_results
# test the equality of the emailA and emailB slopes
linear_combination(lpm_results, c(0,1,-1))
options(digits=4)
# LPM for union membership with a quadratic in experience
union_lpm <- lm_robust(union~educ+exper+I(exper^2)+female, data=cps)
union_lpm
options(digits=5)
# marginal effect of experience at exper = 15 and exper = 20
# (weights 1 and 2*exper+1 on exper and exper^2 for a +1 change)
linear_combination(union_lpm, c(0,0,1,31,0))
linear_combination(union_lpm, c(0,0,1,41,0))