library(tidyverse)
library(tidyr)
library(readxl)
library(labelled)
library(expss)
# load data
<- read_excel("input/indicators_data.xlsx")
indicators_data
# load XLSForm
<- read_excel("input/tool/MoDa_Tool.xlsx", sheet = "survey")
survey <- read_excel("input/tool/MoDa_Tool.xlsx", sheet = "choices") choices
4 Data Analysis
Analysis are produced with a purpose and must respect certain imperatives to ensure final usage. this chapter will guide you through running analysis and produce results. You have collected and processed your data and now the fun starts. Data analysis is the practice of applying statistical and analytically tools to a dataset to serve actionable business insights.
4.1 Categorical Analysis
4.1.1 Select One
4.1.2 Select Multiple
<-function(data,labelset,id_string=NULL,multicolumn=NULL,other_string=NULL,retain_empty=TRUE,language="English"){
multi_response_odk_single
if(!paste0("label::",language)%in%colnames(labelset)){
<-gsub("label::","",colnames(select(labelset,contains("label")))[1])
newlanguage
warning(paste("language",language,"not found. Using first column labelled",newlanguage,"instead"))
<-newlanguage
language
}
if((is.null(id_string) & is.null(multicolumn))|(!is.null(id_string) & !is.null(multicolumn))){
stop("Must include input for id_string or multicolumn")
}
if(is.null(multicolumn)){
%>%
data mutate(group=1) %>%
select(X_uuid,group,contains(id_string)) %>%
pivot_longer(contains(id_string),names_to = "value",values_to = "exist") %>%
filter(exist==1) %>%
mutate(value=str_remove(value,id_string)) %>%
select(-exist) %>%
full_join(
by=c("value"="name"))->long_responses
labelset,
}else{
%>%
data separate(multicolumn,sep=" ",into=paste0(multicolumn,1:(1+max(str_count(.[,multicolumn]," "))))) %>%
select(X_uuid,contains(multicolumn)) %>%
pivot_longer(contains(multicolumn),names_to = "selection",values_to = "value",values_drop_na = TRUE) %>%
select(-selection) %>%
mutate(group=1) %>%
full_join(
by=c("value"="name"))->long_responses
labelset,
}
<- data %>%
overall_respondents_bygroup mutate(group=1) %>%
group_by(group) %>%
summarise(N_respondents=n())
%>%
long_responses mutate(count=ifelse(is.na(group),0,1)) %>%
mutate(group=1) %>%
group_by(group) %>%
mutate(N_responses=sum(count)) %>%
ungroup() %>%
select(value,group,label=contains(paste0("label::",language)),count,N_responses) %>%
group_by(value,group,label,N_responses) %>%
summarise(n=sum(count)) %>%
ungroup() %>%
full_join(overall_respondents_bygroup,by="group") %>%
mutate(per_responses=n/N_responses,
per_respondents=n/N_respondents) %>%
ungroup() %>%
select(value,label,n,per_responses,per_respondents,N_responses,N_respondents) %>%
arrange(desc(per_responses))->output
if(retain_empty==FALSE){
<-filter(output,per_responses>0)
output
}
return(output)
}
<-function(data,labelset,id_string=NULL,multicolumn=NULL,group=NULL,other_string=NULL,retain_empty=TRUE,language="English"){
multi_response_odk
if(!paste0("label::",language)%in%colnames(labelset)){
<-gsub("label::","",colnames(select(labelset,contains("label")))[1])
newlanguagewarning(paste("language",language,"not found. Using first column labelled",newlanguage,"instead"))
}
if(!is.null(group)){
<- data %>%
outputsplit(data[,group],) %>%
map(.f=multi_response_odk_single,labelset,id_string,multicolumn,other_string,retain_empty,language) %>%
list_rbind(names_to = group) %>%
suppressMessages() %>% suppressWarnings()
}else{
<- multi_response_odk_single(data,labelset,id_string,multicolumn,other_string,retain_empty,language) %>%
outputsuppressMessages() %>% suppressWarnings()
}return(output)
}
<-filter(choices,list_name=="OtherCBTSrc")
labels
# fixing the bug
$HHAsstOthCBTRecName.100 <- as.character(indicators_data$HHAsstOthCBTRecName.100)
indicators_data$HHAsstOthCBTRecName.201 <- as.character(indicators_data$HHAsstOthCBTRecName.201)
indicators_data$HHAsstOthCBTRecName.202 <- as.character(indicators_data$HHAsstOthCBTRecName.202)
indicators_data$HHAsstOthCBTRecName.203 <- as.character(indicators_data$HHAsstOthCBTRecName.203)
indicators_data$HHAsstOthCBTRecName.204 <- as.character(indicators_data$HHAsstOthCBTRecName.204)
indicators_data$HHAsstOthCBTRecName.300 <- as.character(indicators_data$HHAsstOthCBTRecName.300)
indicators_data$HHAsstOthCBTRecName.999 <- as.character(indicators_data$HHAsstOthCBTRecName.999)
indicators_data
$name <- as.character(labels$name)
labels
multi_response_odk(data=indicators_data,labelset=labels,id_string="HHAsstOthCBTRecName.")
# A tibble: 7 × 7
value label n per_responses per_respondents N_responses N_respondents
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <int>
1 300 Religious… 525 0.148 0.525 3549 1000
2 999 Other 521 0.147 0.521 3549 1000
3 201 Family 520 0.147 0.52 3549 1000
4 203 Community 501 0.141 0.501 3549 1000
5 204 Community… 497 0.140 0.497 3549 1000
6 202 Friends 495 0.139 0.495 3549 1000
7 100 Government 490 0.138 0.49 3549 1000
multi_response_odk(data=indicators_data,labelset=labels,id_string="HHAsstOthCBTRecName.",retain_empty = FALSE)
# A tibble: 7 × 7
value label n per_responses per_respondents N_responses N_respondents
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <int>
1 300 Religious… 525 0.148 0.525 3549 1000
2 999 Other 521 0.147 0.521 3549 1000
3 201 Family 520 0.147 0.52 3549 1000
4 203 Community 501 0.141 0.501 3549 1000
5 204 Community… 497 0.140 0.497 3549 1000
6 202 Friends 495 0.139 0.495 3549 1000
7 100 Government 490 0.138 0.49 3549 1000
# split it by admin
multi_response_odk(data=indicators_data,labelset=labels,id_string="HHAsstOthCBTRecName.",retain_empty = FALSE,group="ADMIN1Name")
# A tibble: 35 × 8
ADMIN1Name value label n per_responses per_respondents N_responses
<chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 Admin 1 999 Other 119 0.156 0.569 762
2 Admin 1 203 Community 113 0.148 0.541 762
3 Admin 1 204 Community l… 112 0.147 0.536 762
4 Admin 1 300 Religious o… 107 0.140 0.512 762
5 Admin 1 201 Family 106 0.139 0.507 762
6 Admin 1 202 Friends 103 0.135 0.493 762
7 Admin 1 100 Government 102 0.134 0.488 762
8 Admin 2 300 Religious o… 121 0.160 0.555 755
9 Admin 2 202 Friends 111 0.147 0.509 755
10 Admin 2 201 Family 108 0.143 0.495 755
# ℹ 25 more rows
# ℹ 1 more variable: N_respondents <int>
multi_response_odk(data=indicators_data,labelset=labels,multicolumn = "HHAsstOthCBTRecName",retain_empty = FALSE,group="ADMIN1Name",language="English (en)") %>%
select(ADMIN1Name,Crop=label,n,`Percentage of Respondents`=per_respondents) %>%
mutate(`Percentage of Respondents`=scales::percent(`Percentage of Respondents`))
# A tibble: 48 × 4
ADMIN1Name Crop n `Percentage of Respondents`
<chr> <chr> <dbl> <chr>
1 Admin 1 <NA> 762 364.593%
2 Admin 1 <NA> 701 335.407%
3 Admin 1 Other 118 56.459%
4 Admin 1 Community 113 54.067%
5 Admin 1 Community leaders 112 53.589%
6 Admin 1 Religious organization 107 51.196%
7 Admin 1 Family 106 50.718%
8 Admin 1 Friends 103 49.282%
9 Admin 1 Government 102 48.804%
10 Admin 1 <NA> 1 0.478%
# ℹ 38 more rows
multi_response_odk(data=indicators_data,labelset=labels,multicolumn = "HHAsstOthCBTRecName",retain_empty = FALSE) %>%
#adding some lines so that no answer comes last and other selections are in reversing order with most popular at top
mutate(y_axis=reorder(label,n,sum,na.rm=T)) %>%
# mutate(y_axis=relevel(y_axis,ref="No answer")) %>%
#then using geom_col and some standard ggplot functions to make it look a bit prettier
ggplot(aes(y=y_axis,x=per_respondents))+
geom_col(col="black")+
geom_text(aes(label=scales::percent(per_respondents)),hjust=-0.1,size=2)+
scale_x_continuous(labels=scales::percent,limits=c(0,0.5))+
xlab("Percent of Respondents")+
ylab("Response")+
ggtitle("Which crops do you grow?",subtitle="Multiple Responses Allowed")+
theme_light()
Warning in multi_response_odk(data = indicators_data, labelset = labels, :
language English not found. Using first column labelled English (en) instead
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_text()`).
multi_response_odk(data=indicators_data,labelset=labels,multicolumn = "HHAsstOthCBTRecName",retain_empty = FALSE,group="FCSCat") %>%
#adding some lines so that no answer comes last and other selections are in reversing order with most popular at top
mutate(y_axis=reorder(label,n,sum,na.rm=T)) %>%
mutate(y_axis=relevel(y_axis,ref="Other")) %>%
#then using geom_col and some standard ggplot functions to make it look a bit prettier
ggplot(aes(y=y_axis,x=per_respondents,fill=FCSCat))+
geom_col(col="black")+
facet_wrap(~FCSCat,nrow=1)+
geom_text(aes(label=scales::percent(per_respondents)),hjust=-0.1,size=2)+
scale_x_continuous(labels=scales::percent,limits=c(0,1))+
xlab("Percent of Respondents")+
ylab("Response")+
ggtitle("Which crops do you grow?",subtitle="Multiple Responses Allowed")+
theme_light()
Warning in multi_response_odk(data = indicators_data, labelset = labels, :
language English not found. Using first column labelled English (en) instead
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_col()`).
Warning: Removed 6 rows containing missing values or values outside the scale range
(`geom_text()`).
4.2 Numerical Analysis
4.2.1 Averages
4.2.2 Mean/Medium
4.3 Statistical Significance Tests
4.4 Survey Weights
weighing survey respondents are adjusted to better represent the target population. to do so, the weight given each respondent is adjusted to represent the number of similar respondents in the target population.
weights can be used for different reasons. in most of the cases, we will use