Open Dataset: METD1_2024_Final_report_Arizona_new_cleaned_Sheet1
```{r}
BSkyloadDataset(fullpathfilename='D:/MPA Semester 1/Analytic methods/Final project/presentation/dataset/METD1_2024 - Final report Arizona new cleaned.xlsx', filetype='XLSX', worksheetName="Sheet1", replace_ds=TRUE, load.missing=FALSE, csvHeader=TRUE, character.to.factor=TRUE, isBasketData=FALSE, trimSPSStrailing=FALSE, sepChar=',', deciChar='.', encoding="NULL", datasetName='METD1_2024_Final_report_Arizona_new_cleaned_Sheet1')
BSkyLoadRefresh("METD1_2024_Final_report_Arizona_new_cleaned_Sheet1")
```
Successfully opened using:
[1] "readxl::read_excel(path='D:/MPA Semester 1/Analytic methods/Final project/presentation/dataset/METD1_2024 - Final report Arizona new cleaned.xlsx',sheet='Sheet1', col_names=TRUE)"
Table, Basic
```{r}
library(arsenal)
library(lubridate)
library(dplyr)
# defining custom statistic functions
trimmean_func <- function(x, weights=rep(1,length(x)),...) {
meanest <- DescTools::MeanCI(x, trim=5/100, na.rm=TRUE, conf.level=0.95)
as.tbstat(meanest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct1_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=.01, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct5_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=.05, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct10_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=.1, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct25_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=.25, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct33_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=1/3, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct50_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=.5, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct66_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=2/3, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct75_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=.75, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct90_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=.9, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct95_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=.95, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
pct99_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=.99, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
custquant_func <- function(x, weights=rep(1,length(x)),...) {
quantest <- DescTools::QuantileCI(x, prob=20/100, na.rm=TRUE, conf.level=0.95)
as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}
cv_func <- function(x, weights=rep(1,length(x)),...) {
sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE)
}
skewness_func <- function(x, weights=rep(1,length(x)),...) {
DescTools::Skew(x, na.rm=TRUE)
}
kurtosis_func <- function(x, weights=rep(1,length(x)),...) {
DescTools::Kurt(x, na.rm=TRUE)
}
# changing all date variables to Date class for tableby
METD1_2024_Final_report_Arizona_new_cleaned_Sheet1.1 <- METD1_2024_Final_report_Arizona_new_cleaned_Sheet1 %>%
mutate_if(is.timepoint,as.Date)
tab1 <- tableby( ~ sex+race+uhrswork+inctot, data=METD1_2024_Final_report_Arizona_new_cleaned_Sheet1.1, test=FALSE, total=FALSE,
numeric.stats=c('N','mean','min','max','sd'),
cat.stats=c('Nmiss','countpct'),
ordered.stats=c('Nmiss','countpct'),
date.stats=c('Nmiss','median','range'),
numeric.test="anova", cat.test="chisq", ordered.test="trend",
chisq.correct=FALSE, digits=3, digits.pct=1, digits.p=3, simulate.p.value=FALSE, B=2000,
conf.level=0.95, numeric.simplify=FALSE, cat.simplify=FALSE, ordered.simplify=FALSE,
date.simplify=FALSE)
tab1.final <- as.data.frame(summary(tab1, text=TRUE, pfootnote=FALSE,
stats.labels=list(trimmean_func="5% Trimmed Mean (CI)", pct1_func="1st %-ile (CI)", pct5_func="5th %-ile (CI)",
pct10_func="10th %-ile (CI)", pct25_func="25th %-ile (CI)", pct33_func="1st Tertile (CI)", pct50_func="Median (CI)", pct66_func="2nd Tertile (CI)",
pct75_func="75th %-ile (CI)", pct90_func="90th %-ile (CI)", pct95_func="95th %-ile (CI)", pct99_func="99th %-ile (CI)", custquant_func="20 %-ile (CI)",
cv_func="CV", skewness_func="Skewness", kurtosis_func="Kurtosis")))
BSkyFormat(tab1.final, decimalDigitsRounding=-1, singleTableOutputHeader="Variable Summaries ")
```
Attaching package: 'arsenal'
The following object is masked from 'package:lubridate':
is.Date
Variable Summaries
|
Overall (N=10876) |
sex |
|
- female |
5082 (46.7%) |
- male |
5794 (53.3%) |
race |
|
- american indian or alaska native |
236 (2.2%) |
- black/african american |
536 (4.9%) |
- chinese |
85 (0.8%) |
- japanese |
24 (0.2%) |
- multiracial |
2074 (19.1%) |
- other asian or pacific islander |
376 (3.5%) |
- other race, nec |
1269 (11.7%) |
- white |
6276 (57.7%) |
uhrswork |
|
- N |
10876 |
- Mean |
39.072 |
- Min |
1.000 |
- Max |
99.000 |
- SD |
11.643 |
inctot |
|
- N |
10876 |
- Mean |
58272.646 |
- Min |
4.000 |
- Max |
883000.000 |
- SD |
71376.753 |
Bar Chart
```{r}
## [Bar Chart (with means)]
require(ggplot2);
require(ggthemes);
require(Rmisc);
temp <- METD1_2024_Final_report_Arizona_new_cleaned_Sheet1
temp <-Rmisc::summarySE( METD1_2024_Final_report_Arizona_new_cleaned_Sheet1, measurevar = "inctot", groupvars = c("race..abb","sex"), na.rm = TRUE, .drop = TRUE)
pd <- position_dodge(0.9)
temp %>% ggplot( aes(x=race..abb,y=inctot,fill=sex)) +
geom_bar( position="dodge",alpha=1,stat="identity") + labs(x="race..abb",y="inctot",fill="sex",title= "Bar Chart (with means) for X axis: race..abb,Y axis: inctot, Fill: sex") +
xlab("Race") +
ylab("Personal Income") +
ggtitle("The impact of gender and race on personal income") +
theme_grey() +
theme(text=element_text(family="sans",
face="plain",
color="#000000", size=12,
hjust=0.5, vjust=0.5),
plot.title=element_text(hjust=.5, size=18))
```
Loading required package: ggthemes
Loading required package: Rmisc
Loading required package: lattice
Loading required package: plyr
------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
------------------------------------------------------------------------------
Attaching package: 'plyr'
The following object is masked from 'package:purrr':
compact
The following objects are masked from 'package:dplyr':
arrange, count, desc, failwith, id, mutate, rename, summarise,
summarize
Linear Regression
```{r}
require(magrittr)
require(equatiomatic)
require(textutils)
#LinearRegModel1 = METD1_2024_Final_report_Arizona_new_cleaned_Sheet1 %>%
# lm(formula = inctot~0+sex+race+uhrswork, data=.,
# na.action=na.exclude)
LinearRegModel1 = lm(formula = inctot~sex+race+uhrswork, data=METD1_2024_Final_report_Arizona_new_cleaned_Sheet1,
na.action=na.exclude)
#Display theoretical model equation and coefficients
#Display theoretical model
LinearRegModel1 %>%
equatiomatic::extract_eq(raw_tex = FALSE,
wrap = TRUE, intercept = "alpha", ital_vars = FALSE) %>%
BSkyFormat()
#Display coefficients
LinearRegModel1 %>%
equatiomatic::extract_eq(use_coefs = TRUE,
wrap = TRUE, ital_vars = FALSE, coef_digits = BSkyGetDecimalDigitSetting()) %>%
BSkyFormat()
LinearRegModel1 %>%
BSkyFormat()
#Displaying the Anova table
AnovaRes = LinearRegModel1 %>%
anova() %T>%
BSkyFormat(singleTableOutputHeader = "Anova table")
#Displaying sum of squares table
df = as.data.frame(AnovaRes)
totalrows = nrow(df)
regSumOfSquares = sum(df[1:totalrows - 1, 2])
residualSumOfSquares = df[totalrows, 2]
totalSumOfSquares = regSumOfSquares + residualSumOfSquares
matrix(c(regSumOfSquares, residualSumOfSquares,
totalSumOfSquares), nrow = 3, ncol = 1, dimnames = list(c("Sum of squares of regression",
"Sum of squares of residuals", "Total sum of squares"),
c("Values"))) %>%
BSkyFormat(singleTableOutputHeader = "Sum of squares table")
# Model Plotting
#Adding attributes to support scoring
#We don't add dependent and independent variables as this is handled by our functions
attr(.GlobalEnv$LinearRegModel1,"classDepVar")= class(METD1_2024_Final_report_Arizona_new_cleaned_Sheet1[, c("inctot")])
attr(.GlobalEnv$LinearRegModel1,"depVarSample")= sample(METD1_2024_Final_report_Arizona_new_cleaned_Sheet1[, c("inctot")], size = 2, replace = TRUE)
#remove(LinearRegModel1)
```
inctot=α+β1(sexmale)+β2(raceblack/african american)+β3(racechinese) +β4(racejapanese)+β5(racemultiracial)+β6(raceother asian or pacific islander)+β7(raceother , nec) +β8(racewhite)+β9(uhrswork)+ϵ
ˆinctot=−29410.8853+12560.6976(sexmale)+2118.4014(raceblack/african american)+26874.5039(racechinese) +2129.5573(racejapanese)−1104.4504(racemultiracial)+24763.2397(raceother asian or pacific islander)−4497.6828(raceother , nec) +23684.099(racewhite)+1711.8633(uhrswork)
Model: lm(formula = inctot ~ sex + race + uhrswork, data = METD1_2024_Final_report_Arizona_new_cleaned_Sheet1, na.action = na.exclude)
LM Summary |
Residual Std. Error |
df |
R-squared |
Adjusted R-squared |
F-statistic |
numdf |
dendf |
p-value |
66865.9300 |
10866 |
0.1231 |
0.1224 |
169.5289 |
9 |
10866 |
< .001*** |
Note: |
Signif. codes: 0 ' *** ' 0.001 ' ** ' 0.01 ' * ' 0.05 '.' 0.1 ' ' 1 |
Residuals
Min |
1Q |
Median |
3Q |
Max |
-143583.0000 |
-30308.4400 |
-12126.6600 |
10133.6200 |
790572.9000 |
Coefficients
|
Estimate |
Std. Error |
t value |
Pr(>|t|) |
2.5 % |
97.5 % |
(Intercept) |
-29410.8900 |
4856.4350 |
-6.0561 |
< .001*** |
-38930.3800 |
-19891.3900 |
sexmale |
12560.7000 |
1297.2020 |
9.6829 |
< .001*** |
10017.9500 |
15103.4500 |
raceblack/african american |
2118.4010 |
5223.7590 |
0.4055 |
0.6851 |
-8121.1190 |
12357.9200 |
racechinese |
26874.5000 |
8460.0350 |
3.1766 |
0.0015 ** |
10291.2900 |
43457.7100 |
racejapanese |
2129.5570 |
14326.2200 |
0.1486 |
0.8818 |
-25952.4400 |
30211.5500 |
racemultiracial |
-1104.4500 |
4594.4690 |
-0.2404 |
0.8100 |
-10110.4500 |
7901.5460 |
raceother asian or pacific islander |
24763.2400 |
5553.2030 |
4.4593 |
< .001*** |
13877.9500 |
35648.5300 |
raceother race, nec |
-4497.6830 |
4740.7420 |
-0.9487 |
0.3428 |
-13790.4000 |
4795.0370 |
racewhite |
23684.1000 |
4434.0500 |
5.3414 |
< .001*** |
14992.5500 |
32375.6500 |
uhrswork |
1711.8630 |
55.5671 |
30.8071 |
< .001*** |
1602.9420 |
1820.7850 |
Note: |
Signif. codes: 0 ' *** ' 0.001 ' ** ' 0.01 ' * ' 0.05 '.' 0.1 ' ' 1 |
Anova table
|
Df |
Sum Sq |
Mean Sq |
F value |
Pr(>F) |
sex |
1 |
815345666760.0000 |
815345666760.0000 |
182.3610 |
< .001*** |
race |
7 |
1763028594059.0000 |
251861227723.0000 |
56.3315 |
< .001*** |
uhrswork |
1 |
4243381182725.0000 |
4243381182725.0000 |
949.0787 |
< .001*** |
Residuals |
10866 |
48582463786556.0000 |
4471053174.0000 |
NA |
NA |
Note: |
Signif. codes: 0 ' *** ' 0.001 ' ** ' 0.01 ' * ' 0.05 '.' 0.1 ' ' 1 |
Sum of squares table
|
Values |
Sum of squares of regression |
6821755443544.0000 |
Sum of squares of residuals |
48582463786556.0000 |
Total sum of squares |
55404219230100.0000 |