Open Dataset: METD1_2024_Final_report_Arizona_new_cleaned_Sheet1

```{r}
BSkyloadDataset(fullpathfilename='D:/MPA Semester 1/Analytic methods/Final project/presentation/dataset/METD1_2024 - Final report Arizona new cleaned.xlsx', filetype='XLSX', worksheetName="Sheet1", replace_ds=TRUE, load.missing=FALSE, csvHeader=TRUE, character.to.factor=TRUE, isBasketData=FALSE, trimSPSStrailing=FALSE, sepChar=',', deciChar='.', encoding="NULL", datasetName='METD1_2024_Final_report_Arizona_new_cleaned_Sheet1')
BSkyLoadRefresh("METD1_2024_Final_report_Arizona_new_cleaned_Sheet1")
```
Successfully opened using:
[1] "readxl::read_excel(path='D:/MPA Semester 1/Analytic methods/Final project/presentation/dataset/METD1_2024 - Final report Arizona new cleaned.xlsx',sheet='Sheet1', col_names=TRUE)"

Table, Basic

```{r}

library(arsenal)
library(lubridate)
library(dplyr)

# defining custom statistic functions
trimmean_func <- function(x, weights=rep(1,length(x)),...) {
  meanest <- DescTools::MeanCI(x, trim=5/100, na.rm=TRUE, conf.level=0.95)
  as.tbstat(meanest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct1_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=.01, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct5_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=.05, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct10_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=.1, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct25_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=.25, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct33_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=1/3, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct50_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=.5, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct66_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=2/3, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct75_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=.75, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct90_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=.9, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct95_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=.95, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

pct99_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=.99, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

custquant_func <- function(x, weights=rep(1,length(x)),...) {
  quantest <- DescTools::QuantileCI(x, prob=20/100, na.rm=TRUE, conf.level=0.95)
  as.tbstat(quantest, sep=" ", parens=c("(", ")"), sep2=", ")
}

cv_func <- function(x, weights=rep(1,length(x)),...) {
  sd(x, na.rm=TRUE)/mean(x, na.rm=TRUE)
}

skewness_func <- function(x, weights=rep(1,length(x)),...) {
	DescTools::Skew(x, na.rm=TRUE)
}

kurtosis_func <- function(x, weights=rep(1,length(x)),...) {
	DescTools::Kurt(x, na.rm=TRUE)
}

# changing all date variables to Date class for tableby
METD1_2024_Final_report_Arizona_new_cleaned_Sheet1.1 <- METD1_2024_Final_report_Arizona_new_cleaned_Sheet1 %>%
	mutate_if(is.timepoint,as.Date)

tab1 <- tableby( ~ sex+race+uhrswork+inctot, data=METD1_2024_Final_report_Arizona_new_cleaned_Sheet1.1, test=FALSE, total=FALSE,  
	numeric.stats=c('N','mean','min','max','sd'),
	cat.stats=c('Nmiss','countpct'),
	ordered.stats=c('Nmiss','countpct'),
	date.stats=c('Nmiss','median','range'),
	numeric.test="anova", cat.test="chisq", ordered.test="trend", 
	chisq.correct=FALSE, digits=3, digits.pct=1, digits.p=3, simulate.p.value=FALSE, B=2000, 
	conf.level=0.95, numeric.simplify=FALSE, cat.simplify=FALSE, ordered.simplify=FALSE, 
	date.simplify=FALSE)

tab1.final <- as.data.frame(summary(tab1, text=TRUE,   pfootnote=FALSE,
	stats.labels=list(trimmean_func="5% Trimmed Mean (CI)", pct1_func="1st %-ile (CI)", pct5_func="5th %-ile (CI)", 
	pct10_func="10th %-ile (CI)", pct25_func="25th %-ile (CI)", pct33_func="1st Tertile (CI)", pct50_func="Median (CI)", pct66_func="2nd Tertile (CI)", 
	pct75_func="75th %-ile (CI)", pct90_func="90th %-ile (CI)", pct95_func="95th %-ile (CI)", pct99_func="99th %-ile (CI)", custquant_func="20 %-ile (CI)", 
	cv_func="CV", skewness_func="Skewness", kurtosis_func="Kurtosis")))

BSkyFormat(tab1.final, decimalDigitsRounding=-1, singleTableOutputHeader="Variable Summaries ")
```
Attaching package: 'arsenal'
The following object is masked from 'package:lubridate':
    is.Date
Variable Summaries
Overall (N=10876)
sex
- female 5082 (46.7%)
- male 5794 (53.3%)
race
- american indian or alaska native 236 (2.2%)
- black/african american 536 (4.9%)
- chinese 85 (0.8%)
- japanese 24 (0.2%)
- multiracial 2074 (19.1%)
- other asian or pacific islander 376 (3.5%)
- other race, nec 1269 (11.7%)
- white 6276 (57.7%)
uhrswork
- N 10876
- Mean 39.072
- Min 1.000
- Max 99.000
- SD 11.643
inctot
- N 10876
- Mean 58272.646
- Min 4.000
- Max 883000.000
- SD 71376.753

Bar Chart

```{r}

	## [Bar Chart (with means)]
	require(ggplot2);
	require(ggthemes);
	require(Rmisc);
	temp <- METD1_2024_Final_report_Arizona_new_cleaned_Sheet1             
temp <-Rmisc::summarySE( METD1_2024_Final_report_Arizona_new_cleaned_Sheet1, measurevar = "inctot", groupvars = c("race..abb","sex"),  na.rm = TRUE, .drop = TRUE)
	pd <- position_dodge(0.9)
	temp  %>% ggplot( aes(x=race..abb,y=inctot,fill=sex)) +
	geom_bar( position="dodge",alpha=1,stat="identity") +                  labs(x="race..abb",y="inctot",fill="sex",title= "Bar Chart (with means) for X axis: race..abb,Y axis: inctot, Fill: sex") +
	xlab("Race") +
	ylab("Personal Income") +
	ggtitle("The impact of gender and race on personal income") + 
	theme_grey() + 
	theme(text=element_text(family="sans",
	face="plain",
	color="#000000",  size=12,
	hjust=0.5, vjust=0.5),
	plot.title=element_text(hjust=.5, size=18))
```
Loading required package: ggthemes
Loading required package: Rmisc
Loading required package: lattice
Loading required package: plyr
------------------------------------------------------------------------------
You have loaded plyr after dplyr - this is likely to cause problems.
If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
library(plyr); library(dplyr)
------------------------------------------------------------------------------
Attaching package: 'plyr'
The following object is masked from 'package:purrr':
    compact
The following objects are masked from 'package:dplyr':
    arrange, count, desc, failwith, id, mutate, rename, summarise,
    summarize

Linear Regression

```{r}

require(magrittr)
require(equatiomatic)
require(textutils)
#LinearRegModel1 = METD1_2024_Final_report_Arizona_new_cleaned_Sheet1 %>%
                        #  lm(formula = inctot~0+sex+race+uhrswork, data=.,
                        # na.action=na.exclude)   
   
LinearRegModel1 = lm(formula = inctot~sex+race+uhrswork, data=METD1_2024_Final_report_Arizona_new_cleaned_Sheet1,
     na.action=na.exclude)

#Display theoretical model equation and coefficients

#Display theoretical model
LinearRegModel1 %>%
    equatiomatic::extract_eq(raw_tex = FALSE,
        wrap = TRUE, intercept = "alpha", ital_vars = FALSE) %>%
        BSkyFormat()       

#Display coefficients
LinearRegModel1 %>%
    equatiomatic::extract_eq(use_coefs = TRUE,
    wrap = TRUE,  ital_vars = FALSE, coef_digits = BSkyGetDecimalDigitSetting()) %>%
       BSkyFormat()

LinearRegModel1 %>%
    BSkyFormat()      

#Displaying the Anova table
AnovaRes = LinearRegModel1 %>%
            anova() %T>%
			  BSkyFormat(singleTableOutputHeader = "Anova table")

#Displaying sum of squares table
df = as.data.frame(AnovaRes)
totalrows = nrow(df)
regSumOfSquares = sum(df[1:totalrows - 1, 2])
residualSumOfSquares = df[totalrows, 2]
totalSumOfSquares = regSumOfSquares + residualSumOfSquares

matrix(c(regSumOfSquares, residualSumOfSquares, 
        totalSumOfSquares), nrow = 3, ncol = 1, dimnames = list(c("Sum of squares of regression", 
        "Sum of squares of residuals", "Total sum of squares"), 
        c("Values"))) %>%
            BSkyFormat(singleTableOutputHeader = "Sum of squares table")
 
# Model Plotting

#Adding attributes to support scoring
#We don't add dependent and independent variables as this is handled by our functions
attr(.GlobalEnv$LinearRegModel1,"classDepVar")= class(METD1_2024_Final_report_Arizona_new_cleaned_Sheet1[, c("inctot")])
attr(.GlobalEnv$LinearRegModel1,"depVarSample")= sample(METD1_2024_Final_report_Arizona_new_cleaned_Sheet1[, c("inctot")], size = 2, replace = TRUE)
#remove(LinearRegModel1)
```
inctot=α+β1(sexmale)+β2(raceblack/african american)+β3(racechinese) +β4(racejapanese)+β5(racemultiracial)+β6(raceother asian or pacific islander)+β7(raceother , nec) +β8(racewhite)+β9(uhrswork)+ϵ ˆinctot=29410.8853+12560.6976(sexmale)+2118.4014(raceblack/african american)+26874.5039(racechinese) +2129.5573(racejapanese)1104.4504(racemultiracial)+24763.2397(raceother asian or pacific islander)4497.6828(raceother , nec) +23684.099(racewhite)+1711.8633(uhrswork)
Model: lm(formula = inctot ~ sex + race + uhrswork, data = METD1_2024_Final_report_Arizona_new_cleaned_Sheet1, na.action = na.exclude)
LM Summary
Residual Std. Error df R-squared Adjusted R-squared F-statistic numdf dendf p-value
66865.9300 10866 0.1231 0.1224 169.5289 9 10866 < .001***
Note:
Signif. codes: 0 ' *** ' 0.001 ' ** ' 0.01 ' * ' 0.05 '.' 0.1 ' ' 1
Residuals
Min 1Q Median 3Q Max
-143583.0000 -30308.4400 -12126.6600 10133.6200 790572.9000
Coefficients
Estimate Std. Error t value Pr(>|t|) 2.5 % 97.5 %
(Intercept) -29410.8900 4856.4350 -6.0561 < .001*** -38930.3800 -19891.3900
sexmale 12560.7000 1297.2020 9.6829 < .001*** 10017.9500 15103.4500
raceblack/african american 2118.4010 5223.7590 0.4055 0.6851 -8121.1190 12357.9200
racechinese 26874.5000 8460.0350 3.1766 0.0015 ** 10291.2900 43457.7100
racejapanese 2129.5570 14326.2200 0.1486 0.8818 -25952.4400 30211.5500
racemultiracial -1104.4500 4594.4690 -0.2404 0.8100 -10110.4500 7901.5460
raceother asian or pacific islander 24763.2400 5553.2030 4.4593 < .001*** 13877.9500 35648.5300
raceother race, nec -4497.6830 4740.7420 -0.9487 0.3428 -13790.4000 4795.0370
racewhite 23684.1000 4434.0500 5.3414 < .001*** 14992.5500 32375.6500
uhrswork 1711.8630 55.5671 30.8071 < .001*** 1602.9420 1820.7850
Note:
Signif. codes: 0 ' *** ' 0.001 ' ** ' 0.01 ' * ' 0.05 '.' 0.1 ' ' 1
Anova table
Df Sum Sq Mean Sq F value Pr(>F)
sex 1 815345666760.0000 815345666760.0000 182.3610 < .001***
race 7 1763028594059.0000 251861227723.0000 56.3315 < .001***
uhrswork 1 4243381182725.0000 4243381182725.0000 949.0787 < .001***
Residuals 10866 48582463786556.0000 4471053174.0000 NA NA
Note:
Signif. codes: 0 ' *** ' 0.001 ' ** ' 0.01 ' * ' 0.05 '.' 0.1 ' ' 1
Sum of squares table
Values
Sum of squares of regression 6821755443544.0000
Sum of squares of residuals 48582463786556.0000
Total sum of squares 55404219230100.0000