*============================================
*  prepares the PSID sample for estimation
*============================================

* All datasets saved by this script are written to the $output directory.
cd "$output"

* Load the PSID consumption file and attach the price indices by year.
* Forward slashes are used in both paths: Stata accepts them on every
* platform, whereas a backslash separator only works on Windows.
use "$PSID14/psid_cons.dta", clear
* keep(master match) discards years that exist only in the price-index file;
* nogenerate avoids leaving a _merge variable behind.
merge m:1 year using "$MD/price_indices", keep(master match) nogenerate

* Financial wealth: row sum of the four asset components (missing components count as 0).
egen fin_w=rowtotal(cash stock bond penval)
keep 	id year person weduc_ind educ_ind age agew seo y fsize sex kids marit race1 wrace1 smsa hours hourw educ weduc empst wempst ///
		ly wly age_youngest nokids ndcons services totcons totcons_nh tot_assets fchg asset fin_w p_totcons childcare tuition
* Shift year by 1900 (presumably it is stored as an offset from 1900 -- confirm against psid_cons.dta).
replace year=1900+year
sort id year

* Keep couples: drop records whose wife's age is 0, above 100, or missing
keep if agew!=0 & agew<=100

* prepare variables
* Wife's year of birth
g ybw=year-agew

* Recode years of schooling into three categories, first for the wife and
* then for the husband. The raw years variable is renamed (weduc -> wyearschl,
* educ -> yearschl) and the original name is reused for the categorical version:
*   1 = 0-11 grades
*   2 = high school or 12 grades + nonacademic training
*   3 = college dropout, BA degree, or college & adv./prof. degree
*   . = anything else (missing, NA, DK codes above 17)
local catnames   weduc educ
local yearnames  wyearschl yearschl
forvalues i=1/2 {
	local cat : word `i' of `catnames'
	local yrs : word `i' of `yearnames'
	ren `cat' `yrs'
	gen `cat'=1 if inrange(`yrs',0,11)
	replace `cat'=2 if `yrs'==12
	replace `cat'=3 if inrange(`yrs',13,17)
}

* Log real consumption, scaled by the square root of family size.
* `defl' is the price deflator; it is substituted textually into each expression.
local defl (100/p_totcons)
gen logc_nh=log(`defl'*totcons_nh/sqrt(fsize))
gen logc_wh=log(`defl'*totcons/sqrt(fsize))

* do not use wages that are below 0.5 of min wage
* MW by year: 4.75 in 1996, 5.15 for 1997-2006, 5.85 in 2007, 6.55 in 2008, 7.25 from 2009 on.
* NOTE(review): years before 1996 get a missing MW, and because any nonmissing number
* compares below missing, wages in such years would all be blanked by the filter below.
gen MW = cond(year>=2009, 7.25, ///
         cond(year==2008, 6.55, ///
         cond(year==2007, 5.85, ///
         cond(year>1996 & year<=2006, 5.15, ///
         cond(year==1996, 4.75, .)))))

* Hourly wages = labor earnings / annual hours (h: husband, w: wife)
gen wg_h=ly/hours
gen wg_w=wly/hourw
foreach s in h w {
	replace wg_`s'=. if wg_`s'<0.5*MW
}
foreach s in h w {
	gen logwg_`s'=log(`defl'*wg_`s')
}

* Log of income other than the two spouses' earnings (plus 1), and a flag for logUY being exactly 0
gen logUY=log(1+y-ly-wly)
gen noUY = (logUY==0)
* Flag: financial wealth exceeds two months (2/12) of annual income.
* NOTE(review): when y is 0 or missing the ratio is missing, which compares as larger
* than 2/12, so the flag is 1 in those cases -- confirm this is intended.
gen two_ms_fa=(fin_w/y)>(2/12)

* Log of 4160 (= 80*52) minus annual hours, for wife and husband
local tothrs 4160
gen logL_w = log(`tothrs'-hourw)
gen logL = log(`tothrs'-hours)

* Log real joint earnings of the couple
gen log_y =log(`defl'*(ly + wly))

* Ratio of (wage x L) for the wife to (wage x L) for the husband, rebuilt from the logs
cap drop ratio1
gen ratio1 = (exp(logwg_w)*exp(logL_w))/(exp(logwg_h)*exp(logL))

* Winsorize each variable at its 1st and 99th percentiles: values at or above p99
* are set to p99, values at or below p1 are set to p1; missing values are left alone.
* The cut-offs are held in temporary scalars so they keep full precision.
tempname p_lo p_hi
foreach v of varlist logwg_w logc_wh logc_nh logUY logL logL_w ratio1 log_y logwg_h {
	su `v', de
	scalar `p_lo' = r(p1)
	scalar `p_hi' = r(p99)
	* the p1 branch is tested first so that it wins if the two cut-offs coincide
	replace `v' = cond(`v'<=`p_lo', `p_lo', `p_hi') ///
		if (`v'<=`p_lo' | `v'>=`p_hi') & `v'!=.
}

* Education-category dummies for husband (edd*) and wife (wedd*)
tab educ, gen(edd)
tab weduc, gen(wedd)

* Birth-cohort indicators for the wife (WCOH, from ybw) and the husband (COH).
* The husband's birth year is built explicitly from `age': no variable named yb
* exists at this point, and a bare `yb' would be resolved by Stata's variable
* abbreviation to ybw, making COH a copy of WCOH.
* NOTE(review): assumes `age' is the husband's age, the counterpart of agew -- confirm.
tempvar yb
g `yb' = year-age

* Cohort bins: 1 = born before 1945, 2 = 1945-49, 3 = 1950-54, ..., 7 = 1970-74,
* 8 = 1975 or later. A missing birth year leaves the cohort missing rather than
* falling into the open-ended last bin (missing compares as larger than any number).
local cohvars WCOH COH
local byvars  ybw `yb'
forvalues i=1/2 {
	local coh : word `i' of `cohvars'
	local by  : word `i' of `byvars'
	g `coh'=.
	replace `coh'=1 if `by'<1945
	forvalues c=2/7 {
		local lo = 1945+5*(`c'-2)
		replace `coh'=`c' if `by'>=`lo' & `by'<=`lo'+4
	}
	replace `coh'=8 if `by'>=1975 & `by'<.
}
* Drop the helper so it does not end up in the saved datasets
drop `yb'

* Cohort x education cells
egen COHXeduc   = group(COH educ)
egen WCOHXweduc = group(WCOH weduc)

* Cohort dummies
tab COH, gen(Cd)
tab WCOH, gen(WCd)

* Wife's education (categories 1-2) x cohort (1-7) interaction dummies
forvalues e=1/2 {
	forvalues c=1/7 {
		gen wCEd`e'`c' = weduc==`e' &  WCOH==`c'
	}
}

* Sample
* Keep wives aged 25 to 65 in couples (marit==1), and save this sample for the moments
keep if inrange(agew,25,65) & marit==1
save data_PSID_for_moments, replace
* Only households with no young kids.
* Note: records with a missing age_youngest also pass this filter, since missing compares as larger than 10.
keep if age_youngest>10

* Sample selection correction
* both_work = 1 when both log wages are observed and each spouse reports at least
* 80 annual hours (use the accurate measure of annual hours to require at least 2 weeks of annual work)
g both_work = !(logwg_w==. | logwg_h==. | hours<80 | hourw<80)
probit both_work logUY noUY two_ms_fa edd* wedd* yearschl wyearschl
* Linear index from the probit and the implied inverse Mills ratio, pdf(xb)/cdf(xb)
predict xb, xb
g mills = normalden(xb)/normal(xb)
* Continue only with records where both log wages are observed
drop if logwg_w==. | logwg_h==.

* Prepare lagged instruments for the appendix table
* For each variable x: l2_x is the value two periods back, d_x the change over
* those two periods, and l2_d_x that change lagged by another two periods.
tsset person year
foreach x in logL logL_w logwg_w logwg_h logc_wh {
	gen l2_`x'   = L2.`x'
	gen d_`x'    = `x'-l2_`x'
	gen l2_d_`x' = L2.d_`x'
}

* Working less than 60 hours a week (60*52 annual hours), for both spouses;
* records with missing hours are dropped as well
local maxhrs = 60*52
drop if hourw>=`maxhrs' | hours>=`maxhrs'

save data_PSID, replace

* Return to the $MD directory
cd "$MD"
