-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathcollect_data.py
More file actions
executable file
·110 lines (87 loc) · 4.88 KB
/
collect_data.py
File metadata and controls
executable file
·110 lines (87 loc) · 4.88 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile
import pandas as pd
import os
import numpy as np
import sqlalchemy
from sqlalchemy import create_engine
import datapungi_fed as dpf
def myengine():
return create_engine(open("pgres_url.txt", "r").read())
gb_all_url = 'https://www.philadelphiafed.org/-/media/frbp/assets/surveys-and-data/greenbook-data/gbweb/gbweb_all_column_format.zip?la=en&hash=22851EFA1EF12BDB30474720752BB409'
with urlopen(gb_all_url) as zipresp:
with ZipFile(BytesIO(zipresp.read())) as zfile:
zfile.extractall('/tmp/greenbooks')
def read_gb(fname):
# Reads in the greenbook forecast data and reshapes to long format
# Day is calculated weird to avoid setting column to scallar when there is no data in the excel file
df = pd.DataFrame(pd.read_excel(fname, index_col='Date').stack()).reset_index()
df.columns = ['valuedate', 'var_fdate', 'value']
df.loc[:, 'variable'] = df.loc[:, 'var_fdate'].apply(lambda x: x[0:-9])
df.loc[:, 'forecastdate'] = pd.to_datetime(df.loc[:, 'var_fdate'].apply(lambda x: x[-8:]), format='%Y%m%d')
df.loc[:, 'year'] = np.floor(df.loc[:, 'valuedate'])
df.loc[:, 'month'] = 3 * np.round(10*(df.loc[:, 'valuedate'] - np.floor(df.loc[:, 'valuedate'])))
df.loc[:, 'day'] = np.divide(np.floor(df.loc[:, 'valuedate']), np.floor(df.loc[:, 'valuedate']))
df.loc[:, 'valuedate'] = pd.to_datetime(df.loc[:, ['year', 'month', 'day']]) + pd.tseries.offsets.QuarterEnd()
return df.loc[:, ['variable', 'forecastdate', 'valuedate', 'value']]
df = pd.concat([read_gb(fname) for fname in os.scandir('/tmp/greenbooks')], ignore_index=True)
df.to_sql("gb_forecasts", myengine(), if_exists='replace', index=False, method='multi',
dtype={"forecastdate": sqlalchemy.Date(), "valuedate": sqlalchemy.Date()})
def read_gb_outgap(fname):
df = pd.DataFrame(pd.read_excel(fname, index_col=0).stack()).reset_index()
df.columns = ['valuedate', 'var_fdate', 'value']
df.loc[:, 'variable'] = df.loc[:, 'var_fdate'].apply(lambda x: x[0:-7])
df.loc[:, 'forecastdate'] = pd.to_datetime(df.loc[:, 'var_fdate'].apply(lambda x: x[-6:]), format='%y%m%d')
df.loc[:, 'year'] = pd.to_numeric(df.loc[:, 'valuedate'].apply(lambda x: x[0:4]))
df.loc[:, 'month'] = 3*pd.to_numeric(df.loc[:, 'valuedate'].apply(lambda x: x[5:7]))
df.loc[:, 'day'] = 1
df.loc[:, 'valuedate'] = pd.to_datetime(df.loc[:, ['year', 'month', 'day']]) + pd.tseries.offsets.QuarterEnd()
return df.loc[:, ['variable', 'forecastdate', 'valuedate', 'value']]
def read_ebp(fname):
df = pd.read_csv(fname, parse_dates=['date'])
df.columns = ['datem', 'gz_spread', 'gz_premium', 'gz_default_prob']
df.set_index('datem', inplace=True)
return df
def read_bbk_mgdp():
df = pd.read_excel('https://www.chicagofed.org/~/media/publications/bbki/bbki-data-series-xlsx.xlsx', sheet_name='monthly_data').loc[:, ['Date', 'MGDP']]
df.columns = ['datem', 'bbk_mgdp']
df.set_index('datem', inplace=True)
return df
def read_wrds_csv():
"""
loops through all the files in wrdsdata and reads them in
These files should be CSVs pulled from wrds. The date format should be monthly MMDDYY10 (e.g. 07/25/1984)
"""
filepaths = [f'wrdsdata/{f}' for f in os.listdir("wrdsdata") if f.endswith('.csv')]
df = pd.concat(map(lambda x: pd.read_csv(x, parse_dates=['caldt'], index_col='caldt'), filepaths), axis=1)
df.index.rename('datem', inplace=True)
df.index = df.index + pd.offsets.MonthBegin(-1)
return df
def pull_fred_data(series):
"""
Pull data from fred
"""
fred = dpf.data("5240bbe3851ef2d1aaffd0877d6048dd")
datas = [fred.series(s, frequency='m') for s in series]
df = pd.concat(datas, axis=1)
df.index.rename('datem', inplace=True)
return df
def main():
fname = 'https://www.philadelphiafed.org/-/media/frbp/assets/surveys-and-data/greenbook-data/greenbook_output_gap_dh_web.xlsx?la=en&hash=FFA675CD9C77F04E3F2BAA2D5657276D'
read_gb_outgap(fname).to_sql("gb_forecasts", myengine(), if_exists='append', index=False,
method='multi', dtype={"forecastdate": sqlalchemy.Date(), "valuedate": sqlalchemy.Date()})
fseries = ['fedfunds', 'indpro', 'cpiaucsl', 'unrate', 'dgs2', 'dgs10', 'dgs1']
df = pull_fred_data(fseries)
df = df.join(read_wrds_csv(), how='outer')
fname = 'https://www.federalreserve.gov/econresdata/notes/feds-notes/2016/files/ebp_csv.csv'
df = df.join(read_ebp(fname), how='outer')
df = df.join(read_bbk_mgdp(), how='outer')
dflong = pd.melt(df, ignore_index=False).dropna()
dflong.set_index('variable', append=True, inplace=True)
dflong.sort_index(inplace=True)
print("Replacing macro_data SQL table!")
dflong.to_sql('macro_data', myengine(), if_exists='replace', index=True,
method='multi', dtype={"datem": sqlalchemy.Date()})
if __name__ == "__main__":
main()