// ***************************************************

// * This file cleans and saves the transaction-level data.
// * Created by Giselle Labrador
// * Date: 2024-03-15

/***************************************************
// * This file uses the previously cleaned transaction-level data and the distance-to-border file.
**************************************************/

* --- Session setup ---
* Close any log that is still open (no error if none is), then start this
* script's log, overwriting the previous one.
capture log close
log using "$path_results/001_cleaning_transactions.log", replace

* Empty the session and set the maximum number of variables to 11000.
clear all
set maxvar 11000

* Old path setup, kept for reference only (both lines are comments):
//global path "C:/Fang/PhD/RA work/RA_Lu/Robustness Check_202203"
* cd "$path/LTT_main/transactions"

* --- Load the transaction data ---
use "$path_data/GTA_CLEAN_2000_2018.dta", clear

* --- Write a small sample file without disturbing the data in memory ---
* NOTE(review): after restricting to 3-bedroom records, _n==100 keeps only
* the 100th remaining record, so the sample file holds at most one row.
* If a 100-row sample was intended this should be _n<=100 - TODO confirm.
preserve
	keep if bedrooms == 3
	keep if _n == 100
	save "$path_data/GTA_CLEAN_2000_2018_sample.dta", replace
restore

** CLEANING ZIPCODES **
* Drop records with an empty postal code and upper-case the rest.
drop if zipcode == ""
replace zipcode = upper(zipcode)

* Validate on a copy with all spaces removed: it must be exactly six
* characters long and start with the letter-digit-letter-digit-letter-digit
* pattern. The original zipcode variable is left as is (it is the merge key
* used later in this script).
gen zipcode2 = subinstr(zipcode, " ", "", .)
gen len_zipcode2 = length(zipcode2)
drop if len_zipcode2 != 6
keep if regexm(zipcode2,"^[A-Z][0-9][A-Z][0-9][A-Z][0-9]")

// * Create FSA: first three characters of the postal code (zipcode)
* The FSA is taken from the validated, space-free copy rather than from the
* raw zipcode: a raw value with a leading or misplaced space would otherwise
* yield a three-character string that is not the FSA. For raw values whose
* first three characters contain no space the result is unchanged.
gen fsa_from_zip_string = substr(zipcode2, 1, 3)
encode fsa_from_zip_string, gen(fsa_from_zip)

* The space-free copy is no longer needed.
drop zipcode2


** DATE VARIABLES
* Monthly and quarterly versions of the daily transaction date.
generate yearmonth = mofd(date)
generate yearquart = qofd(date)
format yearmonth %tm
format yearquart %tq

// * KEEPING ONLY HOUSES // Note: survival analysis data has only these three types
// keep if TYPE=="SINGLE FAMILY HOUSE" | TYPE=="TOWNHOUSE" | TYPE=="SEMI DETACHED"

* Time to Move
* Natural log of dom; missing wherever dom is zero, negative or missing.
* (dom is presumably days on market - TODO confirm.)
generate lndom = ln(dom)

** HOUSE CHARACTERISTICS (missing garage, #stories, #const type, #sqft), parking
* 0/1 indicators built from the string fields.
generate X_heating  = (heat_sourc != "N/R")
generate X_basement = (basement1 != "N/R" & basement1 != "NONE")
generate X_family   = (family_roo == "Y")
generate X_fire     = (fireplace == "Y")

* Numeric characteristics copied under the common X_ prefix.
generate X_beds  = bedrooms
generate X_baths = washrooms
generate X_kitch = kitchens
generate X_rooms = rooms

* Lot area as depth times frontage.
generate X_lot = lotdepth*lotfront

* Many houses have no information in the data, which shows up as missing
* or zero X_ variables. Such houses are flagged rather than deleted: they
* still count as transactions, but they carry no information in the X's.
* The row total treats missing values as zero, so with_X_data is 0 exactly
* when the X_ variables sum to zero.
tempvar x_total
egen `x_total' = rowtotal(X_*)
generate with_X_data = (`x_total' != 0)
drop `x_total'

// * Last house characteristic
// gen X_attached=1*(TYPE!="SINGLE FAMILY HOUSE")

** MERGE WITH CENTROID DISTANCE
* Many transactions per postal code are matched to one row of the distance
* file. Only `distance' is brought in, only matched records are kept, and
* no _merge variable is left behind.
merge m:1 zipcode using "$path_data/distance_to_border.dta", ///
	keepusing(distance) keep(match) nogenerate

** TREATMENT
* toronto: 1 when distance is negative, 0 otherwise.
* post:    1 for months strictly after 2008m2, 0 otherwise.
* Stata treats a missing value as larger than any number, so without the
* if-qualifiers a record with a missing yearmonth would be coded post = 1
* and a record with a missing distance would be coded toronto = 0. Both are
* left missing instead, and LTT inherits the missing value.
gen toronto = (distance < 0) if !missing(distance)
gen post = (yearmonth > ym(2008,02)) if !missing(yearmonth)
gen LTT = toronto*post

** MONTH INDEX AROUND THE POLICY DATE (TORONTO ONLY)
* TO_3m is a single variable, not a set of dummies: for Toronto records it
* equals 1 in 2007m10, 2 in 2007m11, ... up to 6 in 2008m3; it is 0 for all
* other records (including those with missing toronto or yearmonth).
gen TO_3m = 0
replace TO_3m = yearmonth - ym(2007,10) + 1 ///
	if toronto == 1 & inrange(yearmonth, ym(2007,10), ym(2008,03))

* Treated records with distance strictly between -5000 and -2500.
gen x_25 = LTT*(distance<-2500 & distance>-5000)

* Treatment interacted with the signed distance.
gen LTT_distance = distance*LTT

* Numeric identifier for each distinct postal code.
egen com = group(zipcode)

** SAMPLES **
* 0/1 flags for estimation windows that all start in 2006m1. Records with a
* missing yearmonth get 0 in every flag.
generate sample3 = inrange(yearmonth, ym(2006,01), ym(2012,02))
generate sample4 = inrange(yearmonth, ym(2006,01), ym(2010,02))
* End-date alternative noted by the author for yearly data: ym(2018,02)
generate sample6 = inrange(yearmonth, ym(2006,01), ym(2017,12))

* pre200710 and post200803 (disabled)
// gen pre_200710=yearmonth*(yearmonth<ym(2007,10))
// gen post_200803=yearmonth*(yearmonth<ym(2008,03))

* Numeric, value-labelled version of the property type string.
encode TYPE, generate(PropertyType)

rename MOS month


* Time to Move
* Within each homead, ordered by date: days since the previous record of the
* same homead. Missing for the first record of each homead.
bysort homead (date): generate prev_owner_time_to_move = date - date[_n-1]

* Generate LTT*Dummy_distance
* d_max holds the largest absolute distance in the data (same value on
* every row).
egen d_max = max(abs(distance))

* Older specification, kept because it can absorb the LTT effect. Purpose:
* control for a differential LTT effect by distance. The absolute distance
* is expressed as a share of d_max and cut into four bands.
* NOTE(review): a missing distance gives a missing share, which compares as
* larger than 0.75, so such a record would get d_100 = 1 - TODO confirm that
* distance is never missing at this point.
tempvar dist_share
generate double `dist_share' = abs(distance)/d_max
generate d_25  = (`dist_share' <= 0.25)
generate d_50  = (`dist_share' > 0.25 & `dist_share' <= 0.5)
generate d_75  = (`dist_share' > 0.5 & `dist_share' <= 0.75)
generate d_100 = (`dist_share' > 0.75)
drop `dist_share'

* Treatment interacted with the three nearest bands.
foreach band in 25 50 75 {
	generate LTT_d`band' = LTT*d_`band'
}

// * Quarter periods
* Builds `quarter', an index of consecutive three-month windows.
* The windows start at monthly value 552 (2006m1 in %tm units) and step by
* three months up to 732 (2021m1); the final window is cut off at 732.

local first_month = 552
local last_month  = 732

* One temporary 0/1 variable per window, named after the window's first
* month. yearmonth is integer-valued (it comes from mofd()), so the range
* test is the same as matching each of the window's months individually.
forvalues start = `first_month'(3)`last_month' {
	local stop = min(`start' + 2, `last_month')
	generate three_month_per_`start' = inrange(yearmonth, `start', `stop')
}

* group() numbers the distinct 0/1 patterns in sort order of the variable
* list, so the earliest window receives the highest number.
egen quarter = group(three_month_per_*), missing

* Flip the numbering so that earlier windows get smaller values.
* NOTE(review): the constant 53 only yields 1..52 if the data contain
* exactly 52 distinct patterns. Records whose yearmonth falls outside every
* window (or is missing) share the all-zero pattern, which is numbered first
* by group() and therefore ends up with the largest value after the flip -
* TODO confirm against the actual date range of the data.
replace quarter = 53 - quarter

* The window indicators were only needed to build `quarter'.
drop three_month_per_*

// ***************************************************
// * Save the cleaned transaction data with distance and treatment variables
// ***************************************************

save "$path_data/GTA_CLEAN_2000_2018_distance.dta", replace

// ***************************************************

* End of script: close the log opened at the top.
log close