%Prepare the datasets for analysis.
%The saved datasets are ready to be used by the analysis part of the code.
%Note: in the tables saved here, each row is an item (series) and each column is a time period.
%Chao He, April 2017.

%% US data.
%Import the raw US price-level sheet, save its first three rows as the PCE
%file, and keep only the items that have no finer breakdown below them.
%Left in the workspace for the following steps:
%  rawdata    - raw cell array read from the spreadsheet range
%  datafinest - rows 1-2 of rawdata plus the finest-level items, with the
%               missing-value marker '.....' replaced by NaN
disp(' ')
disp('Now start importing US data')
[~,~,rawdata]=xlsread('..\data\NewRaw\US\Section2ALL_xls.xls','20404U Qtr','a8:ia371');
disp(['Raw US data has ' num2str(size(rawdata,1)-2) ' series.'])

%save PCE US (first three rows of the imported range, all columns).
xlswrite('..\data\PCEUS.xls',rawdata(1:3,1:end));

%if a finer item definition exists, discard the coarse one.
%The position of the first letter in the description (column 2) is used as
%the nesting depth of an item: a larger position means a finer item.
descriptions=rawdata(3:end,2);
numFirstChar=nan(numel(descriptions),1);
for k=1:numel(descriptions)
    if ischar(descriptions{k})
        firstLetter=regexp(descriptions{k},'[a-zA-Z]','once');
        if ~isempty(firstLetter)
            numFirstChar(k)=firstLetter;
        end
    end
end
%A description without any letter would otherwise be dropped silently and
%shift every later row of the keep-mask, so stop with a clear message.
if any(isnan(numFirstChar))
    error('prepareData:badDescription', ...
        'Row(s) %s of the imported range have no text description; cannot determine the item hierarchy.', ...
        mat2str(find(isnan(numFirstChar))'+2));
end
isIncrease=numFirstChar(2:end)>numFirstChar(1:end-1);%1 if the next item is finer. These items are dominated in fineness
isIncrease(1)=1; %the first item is PCE, always dominated.
isIncrease=[isIncrease; 0]; %the last item never has a finer definition.
RowtoKeep=logical([1;1;~isIncrease]); %keep the first two rows, and the items that are not dominated in fineness
datafinest=rawdata(RowtoKeep,:);
%replace missing value with nan (strcmp returns false for non-text cells).
datafinest(strcmp(datafinest,'.....'))={nan};
%display information of the sectors.
disp(' ')
disp('The highest level of disaggregation of items:')
datafinest(:,[1 2])
disp(['With highest level of disaggregation, US data has ' num2str(size(datafinest,1)-2) ' series.'])

%Turn the item rows (everything below the first two rows) into a table.
tablefinest=cell2table(datafinest(3:end,:));

%Label the columns: three identifier columns followed by one column per
%quarter, counted consecutively from 1959Q1 (x1959Q1, x1959Q2, ...).
numPeriods=size(tablefinest,2)-3;
periodIndex=(1:numPeriods)';
quarterOfYear=mod(periodIndex-1,4)+1;        %cycles 1,2,3,4,1,...
calendarYear=1959+floor((periodIndex-1)/4);  %advances after every fourth quarter
periodLabels=arrayfun(@(y,q) sprintf('x%dQ%d',y,q),calendarYear,quarterOfYear,'UniformOutput',false)';
tablefinest.Properties.VariableNames=[{'Num','Description','Label'} periodLabels];

%Drop every series that has a missing entry anywhere in its row.
tablefinest=rmmissing(tablefinest);
disp(' ')
disp(['After removing series with missing values, US data has ' num2str(height(tablefinest)) ' series.'])

%Inflation: 100 x log-difference of consecutive price-level columns.
%Column 4 is the first period; it has no predecessor and is dropped after
%the remaining columns have been overwritten with inflation rates.
priceLevels=tablefinest{:,4:end};
tablefinest{:,5:end}=100*log(priceLevels(:,2:end)./priceLevels(:,1:end-1));
tablefinest(:,4)=[];

%Drop items whose inflation is (numerically) zero in more than 20 quarters.
inflationRates=tablefinest{:,4:end};
nearZero=(inflationRates<1e-10) & (inflationRates>-1e-10);
numZeroQuarters=sum(nearZero,2);
tablefinest(numZeroQuarters>20,:)=[];
disp(' ')
disp(['After removing series with more than 20 quarters of zero price change, US data has ' num2str(height(tablefinest)) ' series.'])

%Drop series that are almost perfectly correlated with an earlier series,
%either in inflation or in the change of inflation.
inflationRates=tablefinest{:,4:end};
levelCorr=corr(inflationRates');                %correlation between row items
changeCorr=corr(diff(inflationRates,1,2)');     %same, for first differences over time
tooSimilar=abs(levelCorr)>0.99 | abs(changeCorr)>0.99;
%Only the strictly lower triangle is inspected, so a flagged row is one that
%is highly correlated with some row above it.
matchesEarlierRow=any(tril(tooSimilar,-1),2);
tablefinest(matchesEarlierRow,:)=[];
disp(' ')
disp(['After removing highly correlated series, US data has ' num2str(height(tablefinest)) ' series.'])

%final output: preview the table and write it to disk.
disp(' ')
disp('In particular, the first 10 rows for US data looks like this')
%show at most 10 rows; fewer if the filters left fewer series.
tablefinest(1:min(10,size(tablefinest,1)),:)
disp(' ')
disp('Now start saving the US data')
writetable(tablefinest,'..\data\tableExtractedDataUS.xls')
disp('US data complete!')


%% EU data
%Import the Eurostat HICP file in long format, clean it, and reshape it so
%that every TIME value becomes its own column (widetable).
%
%Earlier xlsread-based import, kept for reference only:
% [~,~,rawdata]=xlsread('..\data\NewRaw\EU\prc_hicp_midx_1_Data.csv',1,'a2:i594151');
% rawtable=cell2table(rawdata);
% rawtable=rawtable(:,[1 3 7 9]);
% rawtable.Properties.VariableNames={'TIME','COICOP','GEO','Value'};

%The following is faster
for k=1:6
    disp(' ')
end
disp('Now start importing EU data');
% read table
rawtable=readtable('..\data\NewRaw\EU\prc_hicp_midx_1_Data.csv');

disp('Now start cleaning EU data');
%keep columns 1, 3, 4, 7 and 9 of the raw file; the code below and later
%sections refer to TIME, COICOP, GEO and Value among them.
rawtable=rawtable(:,[1 3 4 7 9]);

%remove missing values (':' is treated as the missing marker)
rawtable=standardizeMissing(rawtable,':');
rawtable=rmmissing(rawtable);

%convert strings to numbers.
%str2double returns NaN for numeric input, so only convert when the column
%was imported as text; a column that is already numeric is left untouched.
if ~isnumeric(rawtable.Value)
    rawtable.Value=str2double(rawtable.Value);
end

%from long table to wide table
widetable=unstack(rawtable,'Value','TIME');

%separate data by country and build, for each country, the quarterly
%inflation tables at the 4-digit and 5-digit COICOP levels.
country={'EU','DE','FR','UK','PT'}; %countries. Note EU is expanding over the years.
DataxCountry=cell(length(country),1); %each country's data in a cell.
DataxCountry4=cell(length(country),1); %four digit aggregation level
DataxCountry5=cell(length(country),1); %five digit aggregation level

%Item codes: 'CP' followed by exactly four digits (a fifth digit is rejected
%so that 5-digit items do not leak into the 4-digit set), or the all-items
%code CP00.
fourDigitPattern='^CP\d{4}(?!\d)|^CP00$';
fiveDigitPattern='CP\d{5}';
%Column names of the quarter-end months (March, June, September, December).
quarterEndPattern='x\d{4}M(0[369]|(12))';
%true for every name in a cell array of strings that matches the pattern
hasMatch=@(names,pattern) cellfun(@(s) ~isempty(regexp(s,pattern,'once')),names);

for i=1:length(country)
    %separate data by countries
    DataxCountry{i}=widetable(strcmp(widetable.GEO,country{i}),:);

    %only series without any missing observation are used at either level
    completeSeries=rmmissing(DataxCountry{i});

    %---- four-digit level ----
    %keep only 4 digits goods (plus the all-items series)
    items4=completeSeries(hasMatch(completeSeries.COICOP,fourDigitPattern),:);
    %keep only quarterly data; the first three columns are identifiers
    keepCol=hasMatch(items4.Properties.VariableNames,quarterEndPattern);
    keepCol(1:3)=true;
    items4=items4(:,keepCol);

    %save HCIP: locate the all-items row explicitly instead of assuming it
    %is row 1, and always take it out of the item set, even if saving fails.
    hicpRow=find(strcmp(items4.COICOP,'CP00'),1);
    if isempty(hicpRow)
        warning('prepareData:noHICP', ...
            'No complete all-items series (CP00) found for %s; no HCIP file written.',country{i});
    else
        try
            writetable(items4(hicpRow,:),['..\data\HCIP' country{i} '.xls']);
        catch writeErr
            warning('prepareData:HICPwriteFailed', ...
                'Could not save the all-items series for %s: %s',country{i},writeErr.message);
        end
        items4(hicpRow,:)=[]; %remove the HCIP item
    end

    %calculate inflation (100 x log-difference); the first period is dropped.
    items4{:,5:end}=100*log(items4{:,5:end}./items4{:,4:end-1});
    items4(:,4)=[];
    DataxCountry4{i}=items4;

    %---- five-digit level ----
    %keep only 5 digits goods
    items5=completeSeries(hasMatch(completeSeries.COICOP,fiveDigitPattern),:);
    %keep only quarterly data; the first three columns are identifiers
    keepCol=hasMatch(items5.Properties.VariableNames,quarterEndPattern);
    keepCol(1:3)=true;
    items5=items5(:,keepCol);
    %calculate inflation (100 x log-difference); the first period is dropped.
    items5{:,5:end}=100*log(items5{:,5:end}./items5{:,4:end-1});
    items5(:,4)=[];
    DataxCountry5{i}=items5;
end

%display results: the per-country cell arrays, then a preview of the first
%entry (country 'EU') at the 4-digit level.
disp(' ')
disp('The extracted data for each EU country, with 4-digit and 5-digit items respectively, has size')
DataxCountry4
DataxCountry5
disp(' ')
disp('In particular, the first 10 rows for EU data')
%show at most 10 rows; fewer if the table is shorter.
DataxCountry4{1}(1:min(10,size(DataxCountry4{1},1)),:)
disp(' ')
disp(['EU data has ' num2str(size(DataxCountry4{1},1)) ' series.'])

%Filter the 4-digit table of the first country entry ('EU') and save it.
%The filtered table is written back into DataxCountry4{1}.
euTable=DataxCountry4{1};

%Drop items whose inflation is (numerically) zero in more than 20 quarters.
inflationRates=euTable{:,4:end};
nearZero=(inflationRates<1e-10) & (inflationRates>-1e-10);
numZeroQuarters=sum(nearZero,2);
euTable(numZeroQuarters>20,:)=[];
disp(' ')
disp(['After removing series with more than 20 quarters of zero price change, EU data has ' num2str(height(euTable)) ' series.'])

%Drop series that are almost perfectly correlated with an earlier series,
%either in inflation or in the change of inflation.
inflationRates=euTable{:,4:end};
levelCorr=corr(inflationRates');               %correlation between row items
changeCorr=corr(diff(inflationRates,1,2)');    %same, for first differences over time
tooSimilar=abs(levelCorr)>0.99 | abs(changeCorr)>0.99;
%Only the strictly lower triangle is inspected, so a flagged row is one that
%is highly correlated with some row above it.
matchesEarlierRow=any(tril(tooSimilar,-1),2);
euTable(matchesEarlierRow,:)=[];
DataxCountry4{1}=euTable;
disp(' ')
disp(['After removing highly correlated series, EU data has ' num2str(size(DataxCountry4{1},1)) ' series.'])


disp(' ')
disp('Now start saving the EU data')
writetable(DataxCountry4{1},'..\data\tableExtractedDataEU.xls')
disp(' ')
disp('EU data complete!')


%% US CPI data
%Import the US CPI file, keep the quarter-end months, attach the item names,
%save the first row as the all-items file and convert levels to inflation.
for k=1:6
    disp(' ')
end
disp('Now start importing US CPI data');
% read table
rawtable=readtable('..\data\NewRaw\USCPI\USCPIRAW.csv','ReadVariableNames',1,'DatetimeType','text');


%Keep only quarter-end months (Mar/Jun/Sep/Dec) whose two-digit year is
%96-99 or starts with 0 or 1, selected by column name.
isQuarterEnd=cellfun(@(name) ~isempty(regexp(name,'((Mar)|(Jun)|(Sep)|(Dec))_((9[6-9])|([01]\d))','once')), ...
    rawtable.Properties.VariableNames);
dropColumn=~isQuarterEnd;
dropColumn(1)=false; %the first column is the item code, never delete
%Trailing columns are always deleted because their data are missing.
%NOTE(review): end-9:end covers the last 10 columns, while the original
%comment spoke of the last 9 months - confirm which one is intended.
dropColumn(end-9:end)=true;
rawtable(:,dropColumn)=[];


% read item names and put them in front of the data columns
items=readtable('..\data\NewRaw\USCPI\encoding.csv','ReadVariableNames',1);
rawtable=[items, rawtable];
%save CPI all-items (the first row)
writetable(rawtable(1,:),'..\data\CPIUS.xls');

%remove the all items related series (the first 11 rows).
rawtable(1:11,:)=[];


%remove series with any missing value
rawtable=rmmissing(rawtable);

%calculate inflation: 100 x log-difference of consecutive columns, starting
%from column 4; the first period has no predecessor and is dropped.
priceLevels=rawtable{:,4:end};
rawtable{:,5:end}=100*log(priceLevels(:,2:end)./priceLevels(:,1:end-1));
rawtable(:,4)=[];

%disaggregation level, four-character and six-character.
%The same pipeline is applied to both levels: select the items whose entry
%in the first column has the given length, preview, drop series with too
%many zero-inflation quarters, drop near-duplicate series, and save.
%Results are left in tableExtractedDataUS4CPI and tableExtractedDataUS6CPI.
codeLength=cellfun(@length,rawtable{:,1});
itemLevels=[4 6];
extractedByLevel=cell(size(itemLevels));

for k=1:numel(itemLevels)
    levelDigits=num2str(itemLevels(k));
    levelTable=rawtable(codeLength==itemLevels(k),:);

    %display results
    disp(' ')
    disp(['The extracted CPI data for US, with ' levelDigits '-character items, has size'])
    size(levelTable)
    disp(' ')
    disp('The first 10 rows')
    %show at most 10 rows; fewer if the table is shorter.
    levelTable(1:min(10,size(levelTable,1)),:)

    %remove items with more than 20 quarters of zero price change.
    iszero=(levelTable{:,4:end}>-1e-10) & (levelTable{:,4:end}<1e-10);
    totzeros=sum(iszero,2);
    RowtoRemove=totzeros>20; %remove series with more than 20 zero inflation.
    levelTable(RowtoRemove,:)=[];
    disp(' ')
    disp(['After removing series with more than 20 quarters of zero price change, US data has ' num2str(size(levelTable,1)) ' series.'])

    %remove series highly correlated with the other
    pai=levelTable{:,4:end};
    corrpai=corr(pai'); %correlation between row items
    dpai=pai(:,2:end)-pai(:,1:end-1);
    corrdpai=corr(dpai');
    %keep only the lower triangular matrix. If any row has number 1, it is
    %highly correlated with a previous row.
    highlyCorrelated=tril(abs(corrpai)>0.99 | abs(corrdpai)>0.99,-1);
    RowtoRemove=any(highlyCorrelated,2);
    levelTable(RowtoRemove,:)=[];
    disp(' ')
    disp(['After removing highly correlated series, US data has ' num2str(size(levelTable,1)) ' series.'])


    disp(' ')
    disp(['Now start saving the ' levelDigits '-character US data'])
    writetable(levelTable,['..\data\tableExtractedDataUS' levelDigits 'CPI.xls'])

    extractedByLevel{k}=levelTable;
end

%expose the results under the same variable names as before
tableExtractedDataUS4CPI=extractedByLevel{1};
tableExtractedDataUS6CPI=extractedByLevel{2};


disp(' ')
disp('US CPI data complete!')
