"""
This script is used to combine MLS and TR data and plot the distribution of MLS and TR data in Toronto neighborhoods.
- Time range: Dec 2022
- Listings that have been inactive for a very long time are excluded.
- The data are limited to the City of Toronto. Listings elsewhere in the Greater Toronto Area (GTA) are not included.
- You may also use the final data directly: TR_Dec_Update1month.csv and MLS_nodup.csv in the output folder. In that case, start at the "#%% Combine MLS and TR data" cell.

"""

#%%
# Packages
import os
import pandas as pd
import numpy as np
import re
from datetime import datetime
import geopandas as gpd
from shapely.geometry import Point
import matplotlib.pyplot as plt
from scipy.spatial import cKDTree


# Resolve the working folders relative to the location of this script
directory = os.path.dirname(os.path.abspath(__file__))
script_dir = directory
rawdatadir = os.path.join(directory, 'raw_data')
outputdir = os.path.join(directory, 'output')
os.makedirs(outputdir, exist_ok=True)  # create the output folder if it is missing


# Neighbourhood boundaries (historical 140-neighbourhood shapefile)
_neighbourhoods_shp = os.path.join(rawdatadir, "neighbourhoods_140", "Neighbourhoods - historical 140.shp")
neighborhoods = gpd.read_file(_neighbourhoods_shp)



#%% Import MLS data.
# Build the de-duplicated MLS_nodup dataframe from the December and historical MLS pulls.

dfDec_with_address = pd.read_csv(os.path.join(rawdatadir, 'MLS_Dec_with_location.csv'))
dfExtend_with_address = pd.read_csv(os.path.join(rawdatadir, 'MLS_Hist_with_location.csv'))


# Historical pull: parse the duration and the scrape date, then derive whole days on market.
# (assign evaluates its keyword arguments in order, so the last one sees the parsed timedelta.)
dfExtend_with_address = dfExtend_with_address.assign(
    time_on_mkt=lambda d: pd.to_timedelta(d['time_on_mkt']),
    Date_scrape=lambda d: pd.to_datetime(d['Date_scrape']),
    time_on_mkt_days=lambda d: d['time_on_mkt'].dt.days,
)

# December pull: same parsing, except that Date_scrape is optional in this file
dfDec_with_address['time_on_mkt'] = pd.to_timedelta(dfDec_with_address['time_on_mkt'])
if 'Date_scrape' in dfDec_with_address:
    dfDec_with_address['Date_scrape'] = pd.to_datetime(dfDec_with_address['Date_scrape'])
dfDec_with_address['time_on_mkt_days'] = dfDec_with_address['time_on_mkt'].dt.days

# From the historical pull keep only listings that had been on the market for
# less than 7 days and were scraped on or before 2022-12-04
_on_market_under_a_week = dfExtend_with_address['time_on_mkt_days'] < 7
_scraped_by_cutoff = dfExtend_with_address['Date_scrape'] <= pd.Timestamp('2022-12-04')
dfExtend_filtered = dfExtend_with_address[_on_market_under_a_week & _scraped_by_cutoff].copy()

# Give the December frame any column it lacks so both frames line up for concatenation
if 'Date_scrape' not in dfDec_with_address:
    dfDec_with_address['Date_scrape'] = dfDec_with_address['Pulledfrom']
if 'new_scrape' not in dfDec_with_address:
    dfDec_with_address['new_scrape'] = 0
if 'time_on_mkt_days' not in dfDec_with_address:
    # The column was already added above, so this guard is only a safeguard.
    dfDec_with_address['time_on_mkt_days'] = dfDec_with_address['time_on_mkt'].dt.days

# Stack December first (so its rows win with keep='first'), de-duplicate on
# MLS number + address and then on MLS number + coordinates, and finally drop
# the temporary day-count helper column
MLS_nodup = (
    pd.concat([dfDec_with_address, dfExtend_filtered], ignore_index=True)
    .drop_duplicates(subset=['MLS', 'Address'], keep='first')
    .drop_duplicates(subset=['MLS','location_lat', 'location_lng'], keep='first')
    .drop(columns=['time_on_mkt_days'])
)


# Export the new dataframe
MLS_nodup.to_csv(os.path.join(outputdir, 'MLS_nodup.csv'), index=False)

# Row-count summary
print(f"MLS_nodup created with {len(MLS_nodup)} rows")
print(f"Original dfDec_with_address: {len(dfDec_with_address)} rows")
print(f"Filtered dfExtend_with_address: {len(dfExtend_filtered)} rows")
print(f"Duplicates removed: {len(dfDec_with_address) + len(dfExtend_filtered) - len(MLS_nodup)} rows")






#%% Toronto Rentals: import data
allrentals_df = []
trdir = os.path.join(rawdatadir, 'torontorentals')
trpattern = r"\d{2}_\d{2}_\d{4}"  # scrape date embedded in the file name as mm_dd_yyyy

# Columns kept from every raw file (identical for all files, so defined once)
tr_var_lst = [
    "ID", "name", "postal_code", "date_update", "date_scrape", "property_type",
    "rent_range", "beds_range", "baths_range", "location.lat", "location.lng",
    "dimensions_range", "address1", "units",
]

### Import and process variables
date_lst = []
# os.listdir returns names in arbitrary order; sorting makes both the row order
# and the choice of file for a repeated scrape date reproducible.
for filename in sorted(os.listdir(trdir)):
    if not filename.endswith(".csv"):
        continue
    match = re.search(trpattern, filename)
    if not match:
        # No scrape date in the file name: skip the file rather than reusing a
        # stale date from the previous iteration (or failing on the first file).
        continue
    str_date = datetime.strptime(match.group(0), "%m_%d_%Y")
    if str_date in date_lst:
        # Only the first file seen for a given scrape date is loaded
        continue
    date_lst.append(str_date)

    # read csv file and append to list
    filepath = os.path.join(trdir, filename)
    df = pd.read_csv(filepath)
    # Use the date taken from the file name as the scrape date
    df["date_scrape"] = str_date
    df = df.rename(columns={"updated": "date_update", "id": "ID"})
    df["date_update"] = pd.to_datetime(df["date_update"], format='ISO8601', errors='coerce')
    # NOTE(review): eval() executes whatever expression the CSV cell contains.
    # This is only safe if the scraped files are trusted; consider
    # ast.literal_eval if the cells are plain Python literals.
    df['units'] = df['units'].apply(eval)
    # Keep only the variables of interest (this also discards the photo columns)
    df = df[tr_var_lst]
    # Append to the list of dfs
    allrentals_df.append(df)
## Concatenate all dataframes in the list ---
dfTR_raw = pd.concat(allrentals_df, ignore_index=True)

#%% Clean TR data.
## further clean property types
# dfTR_cleaned is another name for dfTR_raw (no copy is made), so the raw frame
# is modified in place by the cleaning steps.
dfTR_cleaned = dfTR_raw

# Fold the detailed property types into a few broad groups
_property_type_groups = {
    'basement': 'others',
    'duplex': 'town house',
    'multi-unit': 'town house',
    'loft': 'apartment',
    'studio': 'apartment',
    'condo':'apartment',
    'bachelor':'apartment',
    'shared room':'others',
    'private room':'others',
}
_grouped_type = dfTR_raw['property_type'].replace(_property_type_groups)

# Anything that is not one of the three main groups is labelled 'Others'
_is_main_group = _grouped_type.isin(['apartment', 'house', 'town house'])
dfTR_cleaned['property_type'] = _grouped_type.where(_is_main_group, 'Others')

def extract_bounds(range_str):
    """Evaluate a two-element range expression and return ``(lower, upper)``.

    ``range_str`` is evaluated as a Python expression and must unpack into
    exactly two items, otherwise the unpacking raises ``ValueError``.
    """
    # NOTE(review): eval() runs arbitrary code; only safe for trusted input.
    bounds = eval(range_str)
    lower, upper = bounds
    return lower, upper

# Row labels whose lower and upper bounds differ, keyed by range column
uneqIdx = {}

for col in ['rent_range', 'beds_range', 'baths_range']:
    lower_col = col + '_lower'
    upper_col = col + '_upper'

    # Split every range string into separate lower / upper bound columns
    dfTR_cleaned[lower_col] = dfTR_cleaned[col].apply(lambda x: extract_bounds(x)[0])
    dfTR_cleaned[upper_col] = dfTR_cleaned[col].apply(lambda x: extract_bounds(x)[1])

    # Record the rows where the two bounds are not equal
    indices = dfTR_cleaned.index[dfTR_cleaned[lower_col].ne(dfTR_cleaned[upper_col])]
    uneqIdx[col] = indices

    # Replace the range string with the mean of its two bounds (for every row,
    # not only for the rows recorded above)
    dfTR_cleaned[col] = dfTR_cleaned[[lower_col, upper_col]].mean(axis=1)

   

## Time on market ======================
# First and last scrape date of every rental ID, computed in a single grouped
# pass instead of a Python-level loop over all rows. sort=False keeps the IDs
# in order of first appearance.
_scrape_span = dfTR_cleaned.groupby("ID", sort=False)["date_scrape"].agg(
    first_date="min", last_date="max"
)

# Lookup of the form {ID: {"first_date": ..., "last_date": ...}}
time_on_mkt = _scrape_span.to_dict("index")

# Broadcast the per-ID dates back onto the rows
_last_seen = dfTR_cleaned["ID"].map(_scrape_span["last_date"])
_first_seen = dfTR_cleaned["ID"].map(_scrape_span["first_date"])

# "time_on_mkt" is the number of whole days between the first and the last
# scrape in which the listing appears
dfTR_cleaned["time_on_mkt"] = (_last_seen - _first_seen).dt.days
dfTR_cleaned["last_date"] = _last_seen
dfTR_cleaned["first_date"] = _first_seen


### Clean address in Toronto Rentals
# Remove the spaces inside postal codes
dfTR_cleaned['postal_code'] = dfTR_cleaned['postal_code'].str.replace(' ', '')

# TR - filter based on date_scrape
# NOTE(review): the window only spans scrapes dated 2022-12-01 to 2022-12-02 — confirm this is intended.
start_date = pd.Timestamp('2022-12-01')
end_date = pd.Timestamp('2022-12-02')
dfTR_Dec_cleaned = dfTR_cleaned.loc[(dfTR_cleaned['date_scrape'] >= start_date) & (dfTR_cleaned['date_scrape'] <= end_date)]

dfTR_Dec_cleaned.to_csv(outputdir +'/TR_Dec_all.csv')

# exclude those inactive for very long time
# .copy() makes this an independent frame, so the column assignment below does
# not write into a slice of dfTR_Dec_cleaned (chained-assignment warning).
TR_Dec_Update_1month = dfTR_Dec_cleaned.loc[dfTR_Dec_cleaned['time_on_mkt'] <= 31].copy()

# Parse date_update if it is still stored as text, then keep only the listings
# updated after 2022-10-31
if TR_Dec_Update_1month['date_update'].dtype == 'object':
    TR_Dec_Update_1month['date_update'] = pd.to_datetime(TR_Dec_Update_1month['date_update'], format='ISO8601', errors='coerce')
TR_Dec_Update_1month = TR_Dec_Update_1month[TR_Dec_Update_1month['date_update'] > '2022-10-31']

# Export data
TR_Dec_Update_1month.to_csv(os.path.join(outputdir, 'TR_Dec_Update1month.csv'))

print(f"TR_Dec_Update_1month created with {len(TR_Dec_Update_1month)} rows")
print(f"Original dfTR_Dec_cleaned: {len(dfTR_Dec_cleaned)} rows")
print(f"Filtered TR_Dec_Update_1month: {len(TR_Dec_Update_1month)} rows")
print(f"Excluded rows: {len(dfTR_Dec_cleaned) - len(TR_Dec_Update_1month)}")
print("!! NOTE: This is for the Greater Toronto Area (GTA) !!")
print("!! NOTE: This is for the Greater Toronto Area (GTA) !!")




#%% Combine MLS and TR data
# also calculate their distances

# Reload both cleaned tables from the output folder
TR_Dec_Update_1month = pd.read_csv(f"{outputdir}/TR_Dec_Update1month.csv")
MLS_nodup = pd.read_csv(f"{outputdir}/MLS_nodup.csv")


# Filter data within neighborhoods boundaries
def filter_by_neighborhoods(df, lat_col, lng_col):
    """Keep the rows of ``df`` whose point lies within a neighbourhood polygon.

    Args:
        df: DataFrame holding one listing per row.
        lat_col: Name of the latitude column.
        lng_col: Name of the longitude column.

    Returns:
        A plain DataFrame with the rows that fall within one of the polygons
        of the module-level ``neighborhoods`` GeoDataFrame, plus the columns
        of the matched neighbourhood. If either coordinate column is missing,
        a warning is printed and ``df`` is returned unchanged.

    The input frame is not modified.
    """
    if lat_col not in df.columns or lng_col not in df.columns:
        print(f"Warning: {lat_col} or {lng_col} not found in dataframe")
        return df

    # Build all point geometries in one vectorized call (x = longitude, y = latitude)
    points = gpd.points_from_xy(df[lng_col], df[lat_col])

    # Work on a copy so the caller's dataframe does not gain a 'geometry' column.
    # NOTE(review): the points are tagged with the CRS of the neighbourhood file,
    # which assumes the coordinates are already in that CRS — confirm.
    gdf = gpd.GeoDataFrame(df.copy(), geometry=points, crs=neighborhoods.crs)

    # Inner spatial join: only points within a neighbourhood polygon survive
    gdf_with_neighborhoods = gpd.sjoin(gdf, neighborhoods, how='inner', predicate='within')

    # Convert back to DataFrame and remove geometry columns
    result = pd.DataFrame(gdf_with_neighborhoods.drop(columns=['geometry', 'index_right']))

    print(f"Filtered {len(df)} -> {len(result)} points within neighborhoods")
    return result

# Restrict both datasets to the listings located inside the neighbourhood boundaries
TR_filtered = filter_by_neighborhoods(TR_Dec_Update_1month, lat_col='location.lat', lng_col='location.lng')
MLS_filtered = filter_by_neighborhoods(MLS_nodup, lat_col='location_lat', lng_col='location_lng')




# Calculate distances from TR points to nearest MLS points
def calculate_distances_to_mls(tr_df, mls_df, tr_lat_col, tr_lng_col, mls_lat_col, mls_lng_col):
    """Return, for every TR row, the distance to the closest MLS row.

    Latitude/longitude are turned into planar coordinates with fixed scale
    factors (111000 per degree of latitude; the same value multiplied by the
    cosine of latitude 43.6532 per degree of longitude), and the nearest
    neighbour is looked up with a KD-tree built on the MLS points.

    Args:
        tr_df: DataFrame of TR listings.
        mls_df: DataFrame of MLS listings.
        tr_lat_col, tr_lng_col: Latitude / longitude column names in ``tr_df``.
        mls_lat_col, mls_lng_col: Latitude / longitude column names in ``mls_df``.

    Returns:
        Array with one distance per row of ``tr_df``, in the units of the
        scaled coordinates (approximately metres).
    """
    reference_lat = 43.6532  # Toronto's approximate latitude
    per_degree_lat = 111000  # meters per degree latitude
    per_degree_lng = 111000 * np.cos(np.radians(reference_lat))  # meters per degree longitude

    def to_planar(frame, lat_col, lng_col):
        # x comes from longitude, y from latitude
        lats = frame[lat_col].values
        lngs = frame[lng_col].values
        return np.column_stack([lngs * per_degree_lng, lats * per_degree_lat])

    tr_xy = to_planar(tr_df, tr_lat_col, tr_lng_col)
    mls_xy = to_planar(mls_df, mls_lat_col, mls_lng_col)

    # Index the MLS points, then query the single nearest one for each TR point
    nearest_dist, _nearest_idx = cKDTree(mls_xy).query(tr_xy, k=1)
    return nearest_dist

# Distance from every TR listing to its closest MLS listing
print("Checking MLS listings for TR records...")
tr_distances = calculate_distances_to_mls(
    tr_df=TR_filtered,
    mls_df=MLS_filtered,
    tr_lat_col='location.lat',
    tr_lng_col='location.lng',
    mls_lat_col='location_lat',
    mls_lng_col='location_lng',
)
print("Done")
# Store the result next to the TR listings for the plotting step
TR_filtered['distance_to_mls'] = tr_distances

#%% Plot map
# Create figure
fig, ax = plt.subplots(1, 1, figsize=(15, 12))

# Plot neighborhoods boundaries
neighborhoods.boundary.plot(ax=ax, color='black', linewidth=0.5, alpha=0.7)

# Plot MLS points (beige medium-sized points)
ax.scatter(MLS_filtered['location_lng'], MLS_filtered['location_lat'], 
          c='#F4E4BC', s=50, alpha=0.8, label='MLS', edgecolors='#D4A574', linewidth=0.5)

# Plot TR points
# A TR listing counts as "included in MLS" when its nearest MLS listing is at
# most 200 (distance units of distance_to_mls) away; the rest are "not included".
tr_close = TR_filtered[TR_filtered['distance_to_mls'] <= 200]
tr_far = TR_filtered[TR_filtered['distance_to_mls'] > 200]

# Gray points: TR close to an MLS listing
if len(tr_close) > 0:
    ax.scatter(tr_close['location.lng'], tr_close['location.lat'], 
              c='gray', s=20, alpha=0.6, label='TR (included in MLS)')

# Red points: TR with no MLS listing nearby
if len(tr_far) > 0:
    ax.scatter(tr_far['location.lng'], tr_far['location.lat'], 
              c='red', s=10, alpha=0.8, label='TR (not included in MLS)')

# Set figure properties
ax.set_title('TR vs MLS Data Distribution in Toronto Neighborhoods', fontsize=16, fontweight='bold')
ax.set_xlabel('Longitude', fontsize=12)
ax.set_ylabel('Latitude', fontsize=12)
ax.legend(fontsize=10)
ax.grid(True, alpha=0.3)

# Share of TR listings without a nearby MLS listing. With no TR listings at all
# the rate is undefined, so report NaN instead of dividing by zero.
if len(TR_filtered) > 0:
    tr_missing_rate = len(tr_far) / len(TR_filtered) * 100
else:
    tr_missing_rate = float('nan')

# Add statistics information
stats_text = f"""
Data Summary:
MLS points: {len(MLS_filtered)}
TR points: {len(TR_filtered)}
TR included in MLS: {len(tr_close)}
TR not included in MLS: {len(tr_far)}
TR missing rate: {tr_missing_rate:.1f}%
"""

ax.text(0.02, 0.98, stats_text, transform=ax.transAxes, fontsize=10,
        verticalalignment='top', bbox=dict(boxstyle='round', facecolor='white', alpha=0.8))

# Save figure
plt.tight_layout()
plt.savefig(outputdir + '/TR_vs_MLS_distribution.png', dpi=300, bbox_inches='tight')
plt.show()

print(f"Plot saved as: {outputdir}/TR_vs_MLS_distribution.png")
print(f"MLS points: {len(MLS_filtered)}")
print(f"TR points: {len(TR_filtered)}")
print(f"TR points ≤200m from MLS: {len(tr_close)}")
print(f"TR points >200m from MLS: {len(tr_far)}")








# %%
