********************************************************************************
* PROJECT: EMENO Survey - Complex Sampling & Post-Stratification
* FILE: analysis.do
* PURPOSE: Clean, label, and analyze survey data with complex design in Stata
* AUTHOR: Sotiris Roussos
* DATE: 2025-10-31
********************************************************************************

* ==============================================================================
* INITIAL SETUP
* ==============================================================================
* Turn off the --more-- pager up front so the script never pauses for input.
set more off

* Work from the lecture folder; the file names used below are relative to it.
cd "C:\Sotiris_Roussos\Biostatistics\\ \\_\Lecture 4\STATA13"

* Load the survey data, replacing whatever is currently in memory.
use "analysis.dta", clear

* Remove every value label stored with the file; the ones needed are
* defined again further down in this script.
label drop _all

* ==============================================================================
* VARIABLE LABELS
* ==============================================================================
* Column order: identifiers and design variables first, then demographics.
order emenoid blockid strata urban area base_weight gender age age_gr

* Descriptive labels, listed in the same order as the columns above.
label var emenoid      "Identifier (emenoid)"
label var blockid      "PSU (blockid)"
label var strata       "Stratum"
label var urban        "Degree of urbanization"
label var area         "Region"
label var base_weight  "Sampling weight"
label var gender       "Gender"
label var age          "Age (years)"
label var age_gr       "Age group"

* ==============================================================================
* VALUE LABEL DEFINITIONS
* ==============================================================================
* Value-label sets for the categorical design/demographic variables.
label define urban_lbl    1 "Urban" 2 "Semi-urban" 3 "Rural", replace
label define ilikiaki_lbl 0 "18-29" 1 "30-39" 2 "40-49" 3 "50-59" 4 "60-69" 5 "70+", replace
label define gender_lbl   0 "Male" 1 "Female" ///
                          888 "Unknown" 900 "Don't know" 999 "No answer", replace
label define region_lbl   1 "Athens" 2 "Crete" 3 "Thessaloniki" 4 "Thrace" ///
                          5 "Thessaly" 6 "Peloponnese" 7 "Epirus" 8 "Corfu" ///
                          9 "Central Greece" 10 "Macedonia" 11 "Lesvos-Rhodes", replace

* Attach each label set to its variable.
label val gender gender_lbl
label val urban  urban_lbl
label val area   region_lbl
label val age_gr ilikiaki_lbl

* Code 888 is turned into system missing for the demographic variables.
foreach v of varlist gender age age_gr {
    replace `v' = . if `v' == 888
}

* Store the labelled and recoded dataset.
save "analysis_stata13.dta", replace


* ==============================================================================
* DESCRIPTIVE STATISTICS
* ==============================================================================
* Overview of the variables currently in memory.
describe

* Distribution of the base sampling weight: codebook entry, summary
* statistics, selected percentiles and a histogram on the percent scale.
codebook base_weight
summarize base_weight
tabstat base_weight, statistics(n mean sd p25 p50 p75 min max)
histogram base_weight, percent

* Declare the complex design: blockid is the PSU, base_weight the
* probability weight, and strata the stratification variable.
svyset blockid [pw = base_weight], strata(strata)
svydescribe

* Area distribution: unweighted sample counts versus design-weighted estimates.
tabulate area
svy: tabulate area, percent obs
graph bar (percent), over(area) name(sample, replace)
svy: proportion area

* Design-weighted cross-tabulation of age group by gender.
svy: tabulate age_gr gender, percent obs


* ==============================================================================
* POST-STRATIFICATION
* ==============================================================================
* Post-stratification cells are area x gender x age group. Respondents with
* missing age or gender are removed before the cells are formed.
drop if missing(age) | missing(gender)

* Distributions under the base weights, before any adjustment.
tab gender
svy: tab gender, per obs
svy: tab age_gr, per obs
svy: tab age_gr gender, row

* Attach the cell-level census information to every respondent.
* keep(master match) discards census cells that contain no respondents: those
* rows would have a missing emenoid and missing design variables, and two or
* more of them make emenoid non-unique, which breaks a later 1:1 merge on it.
* Respondents whose cell is not found in census2011 are kept (_merge == 1);
* their freq, and therefore their final weight, is missing.
merge m:1 area gender age_gr using census2011, keep(master match)

* Adjustment factor = census count of the cell / sum of base weights in the cell.
* freq is presumably the population count supplied by census2011 - confirm.
bysort area gender age_gr: egen total_we = total(base_weight)
gen adj_factor = freq / total_we
gen weight_fin = base_weight * adj_factor

* Redefine the survey design with the post-stratified weights. Strata with a
* single PSU are handled with the "centered" variance option.
svyset blockid [pweight = weight_fin], strata(strata) singleunit(centered)




* ==============================================================================
* ADDITIONAL DATA & LABELS
* ==============================================================================
* Free the name _merge so the next merge can create its own indicator.
rename _merge _merge1

* Add the examination data, matched one-to-one on emenoid.
merge 1:1 emenoid using "data_exams.dta"

* ------------------------------------------------------------------------------
* Variable labels for the examination variables
* ------------------------------------------------------------------------------
label var bmi_doct         "Body Mass Index (BMI)"
label var bmi_doc_cat      "BMI category"
label var chol             "Total cholesterol (mg/dL)"
label var ldl              "LDL cholesterol (mg/dL)"
label var alcohol          "Alcohol intake (units/week)"
label var alc_bin          "Alcohol 7+ units/week"
label var PA               "Physical activity level"
label var walk210          "Walking <30 min/day"
label var diab_prev        "Diabetes"
label var hypertension_pr  "Hypertension"
label var hyperchol200     "Hypercholesterolemia (>200 mg/dL)"
label var hyperldl130      "High LDL (130+ mg/dL)"

* ------------------------------------------------------------------------------
* Value-label sets for the examination variables
* ------------------------------------------------------------------------------
label define yesno_lbl 0 "No" 1 "Yes" ///
                       888 "Unknown" 900 "Don't know" 999 "No answer", replace
label define bmi_lbl   1 "Normal weight" 2 "Overweight" 3 "Obese", replace
label define PA_lbl    1 "Low" 2 "Moderate" 3 "High", replace
label define walk_lbl  0 "<30 min/day" 1 "30+ min/day", replace
label define alc_lbl   0 "0-6 units/week" 1 "7+ units/week", replace

* One label set per categorical variable; the four binary outcomes share yesno_lbl.
label values bmi_doc_cat bmi_lbl
label values PA          PA_lbl
label values walk210     walk_lbl
label values alc_bin     alc_lbl
label values diab_prev hypertension_pr hyperchol200 hyperldl130 yesno_lbl

* ------------------------------------------------------------------------------
* Participation indicator: 1 when the record was matched in the examination
* merge above (_merge == 3), 0 otherwise.
* ------------------------------------------------------------------------------
gen participated = _merge == 3
label var    participated "Participation in examination"
label values participated yesno_lbl

* Store the combined dataset.
save "analysis_all_stata13.dta", replace


* Output formats for estimation tables. The interactive ". " prompt prefixes
* that had been pasted from the Results window are removed so that each line
* is a plain do-file command.
set cformat %6.3f   // coefficients (incl. odds ratios), standard errors, confidence limits
set sformat %6.3f   // test statistics
set pformat %4.3f   // p-values

* Show the base (reference) level of every factor variable in the tables.
set showbaselevels on

* Odds ratios for participation in the examination by gender, age group,
* urbanization and area. This is an ordinary logistic model without the svy
* prefix, so the survey design is not used here.
logistic participated i.gender i.age_gr i.urban i.area

* ==============================================================================
* SURVEY ESTIMATES
* ==============================================================================
* Design-weighted gender distribution, then mean age with its standard deviation
* (estat sd has to follow svy: mean directly).
svy: tabulate gender, percent obs
svy: mean age
estat sd

* Design-weighted percentages for the remaining categorical descriptors.
foreach v of varlist urban PA bmi_doc_cat {
    svy: tabulate `v', percent obs
}

* Prevalence of each condition with confidence intervals.
foreach v of varlist hypertension_pr diab_prev hyperchol200 {
    svy: tabulate `v', percent obs ci
}


* ==============================================================================
* LOGISTIC REGRESSIONS (UNIVARIABLE & MULTIVARIABLE)
* ==============================================================================
* Univariable design-based logit models for hypertension, one predictor at a
* time (coefficients on the log-odds scale).
foreach rhs in i.bmi_doc_cat i.urban i.gender alco age {
    xi: svy: logit hypertension_pr `rhs'
}

* BMI category alone, reported as odds ratios ...
xi: svy: logit hypertension_pr i.bmi_doc_cat, or

* ... and then adjusted for one additional covariate at a time.
foreach adj in i.urban i.gender age {
    xi: svy: logit hypertension_pr i.bmi_doc_cat `adj', or
}


* ==============================================================================
* LDL AND AGE INTERACTIONS
* ==============================================================================
* Prevalence of high LDL overall and by gender (column percentages).
svy: tab hyperldl130, per obs ci
svy: tab hyperldl130 gender, per obs col

* High LDL on gender, age and their interaction, adjusted for BMI category.
* The model is written in factor-variable notation, so its coefficients are
* named 1.gender and 1.gender#c.age (not the _I* names that xi would create).
svy: logit hyperldl130 i.gender##c.age i.bmi_doc_cat

* Odds ratio for gender == 1 versus gender == 0 at selected ages:
* exp( b[1.gender] + age * b[1.gender#c.age] ).
foreach a of numlist 20 35 50 70 85 {
    lincom 1.gender + `a' * 1.gender#c.age, eform
}


* ==============================================================================
* FUNCTIONAL FORM OF AGE
* ==============================================================================
* Group age into bands; each band takes the value of its lower cut-point.
* cut() is an egen function, so egen (not gen) is required here.
egen age_2 = cut(age), at(18,25,30,35,40,45,50,55,60,65,70,75,80,85,120)

* Observed (unweighted) log-odds of high LDL within each age band.
* The proportion uses the same outcome as the models below (hyperldl130) and
* is computed with mean(), so observations with a missing outcome are left
* out of the denominator instead of being counted through _N.
* NOTE(review): assumes hyperldl130 only holds 0/1 or missing at this point;
* codes such as 888/900/999 would distort the proportion - confirm.
capture drop prob
capture drop logitprob
bysort age_2: egen prob = mean(hyperldl130)
gen logitprob = logit(prob)

* Linear age.
svy: logit hyperldl130 age
predict pr_linear, xb

* Transformed age terms used by the alternative specifications.
capture drop age2
gen age2  = age^2
gen age12 = age^(1/2)
gen age3  = age^3

* Quadratic age.
svy: logit hyperldl130 age age2
predict pr_squared, xb

* Cubic age.
svy: logit hyperldl130 age age2 age3
predict pr_cubic, xb

* Spline in age. rc_spline is not an official Stata command and has to be
* installed beforehand (presumably from SSC - confirm); the model uses the
* _S* variables it leaves in the data.
rc_spline age
svy: logit hyperldl130 _S*
predict pr_spline, xb
capture drop _S*

* Age plus square root of age.
svy: logit hyperldl130 age age12
predict pr_sqroot, xb

* Fitted linear predictors against age, together with the observed log-odds
* per age band. The legend names all six plots in plotting order.
twoway scatter pr_linear  age, mcolor(red)  || ///
       scatter logitprob  age_2             || ///
       scatter pr_squared age, mcolor(blue) || ///
       scatter pr_cubic   age               || ///
       scatter pr_spline  age               || ///
       scatter pr_sqroot  age,                 ///
       legend(order(1 "linear" 2 "observed" 3 "squared" 4 "cubic" 5 "spline" 6 "square root"))


* ==============================================================================
* FINAL MODELS AND VISUALIZATIONS
* ==============================================================================
* High LDL with quadratic age, gender and BMI category (main effects only).
* Age squared is written as c.age#c.age (via ##) so that margins can vary
* age and its square together.
svy: logit hyperldl130 c.age##c.age i.gender i.bmi_doc_cat

* Same model with gender interacting with both the linear and the squared age
* term. Factor-variable notation replaces the xi prefix because margins only
* recognises gender as a factor when it enters the model as i.gender.
svy: logit hyperldl130 i.gender##c.age##c.age i.bmi_doc_cat

* Predicted probabilities by gender across age, other covariates at their means.
margins gender, at(age = (18(10)100)) atmeans
marginsplot

* High cholesterol on gender, BMI, age, the BMI x age interaction and walking.
svy: logit hyperchol200 gender c.bmi_doct##c.age walk210

* Odds ratio for a 10-year increase in age at selected BMI values:
* exp( 10*b[age] + BMI*10*b[c.bmi_doct#c.age] ).
* The interaction coefficient is named c.bmi_doct#c.age in this model.
foreach b of numlist 20 25 30 35 40 {
    lincom 10*_b[age] + `b'*10*_b[c.bmi_doct#c.age], eform
}

* Predicted probabilities over a BMI x age grid, saved to predictions.dta.
margins, at(bmi_doct = (10(3)100) age = (20(20)100)) atmeans saving(predictions, replace)
marginsplot


* ==============================================================================
* PREDICTED PROBABILITIES - CONTOUR PLOT
* ==============================================================================
* Load the predictions saved by margins. This replaces the analysis data in memory.
use predictions, clear
list _at* _margin in 1/5

* Give the saved columns readable names.
* NOTE(review): _at2 and _at3 are taken to hold bmi_doct and age respectively;
* check this against the listing above before relying on the plot.
rename _at2    bmi_doct
rename _at3    age
rename _margin pr_highchol

* Contour of the predicted probability over the BMI x age grid.
* No marginsplot here: it only works directly after margins, and the margins
* results were already plotted before this dataset was loaded.
twoway contour pr_highchol bmi_doct age

********************************************************************************
* END OF DO-FILE
********************************************************************************
