Coverage for src/ifunnel/financial_data_preprocessing/clean_downloaded_data.py: 0%
49 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-12 09:14 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-12 09:14 +0000
1import os
3import pandas as pd
6def clean_data(data_raw):
7 """
8 Function to clean raw data from Yahoo! finance and transform it into weekly returns
9 """
10 data_raw = data_raw.fillna("")
12 # Delete tickers for which we don't have data for the whole time period
13 for column in data_raw.columns:
14 # if the first three or last three values of the column are not empty, then delete the column
15 if not data_raw[column].values[:3].all() or not data_raw[column].values[-3:].all():
16 data_raw.drop(column, axis=1, inplace=True)
18 # Fill missing daily prices with the closest available price in the future
19 data = data_raw.copy()
20 for asset in data_raw.columns:
21 for indx, date in enumerate(data_raw.index):
22 if not data_raw.loc[date, asset]:
23 for date_future in list(data_raw.index)[indx:]:
24 if data_raw.loc[date_future, asset]:
25 data.loc[date, asset] = data_raw.loc[date_future, asset]
26 print("found price")
27 break
28 else:
29 continue
31 # Delete tickers (outliers) with daily returns bigger that 20%
32 to_delete = []
33 for asset in data.columns:
34 column = list(data[asset])
35 value_old = column[0]
36 for value in column[1:]:
37 if abs((value / value_old) - 1) > 0.20:
38 to_delete.append(asset)
39 print(asset, (value / value_old) - 1, len(to_delete))
40 break
41 else:
42 value_old = value
43 for delete_col in to_delete:
44 data.drop(delete_col, axis=1, inplace=True)
46 # Select only Wednesdays to be able to compute monthly returns
47 data_wed = data[data.index.weekday == 2]
49 # Check if we have all Wednesdays' prices, if not fill it with the price 5 days in the past
50 date_test = data_wed.index[0]
51 date_list = data_wed.index.to_list()
52 while date_test < date_list[-1]:
53 date_test = date_test + pd.Timedelta(days=7)
54 if date_test not in data_wed.index:
55 print(date_test)
56 data_wed.loc[date_test] = data.loc[date_test - pd.Timedelta(days=5)].to_list()
58 # Sort df by index
59 data_wed = data_wed.sort_index()
61 # Create dataframes with returns instead of prices
62 data_wed_rets = data_wed.copy()
63 for asset in data_wed.columns:
64 data_wed_rets[asset] = data_wed[asset].pct_change()
66 # drop the first row, because it contains NaNs
67 data_wed_rets = data_wed_rets.drop(data_wed_rets.index[0])
69 wanted_columns = [col for col in data_wed_rets.columns if col[0] != "nan"]
70 data_wed_rets = data_wed_rets[wanted_columns]
71 # Save results with returns into data folder for the app
72 data_wed_rets.to_parquet(
73 os.path.join(os.path.dirname(os.getcwd()), "financial_data/all_etfs_rets.parquet.gzip"),
74 compression="gzip",
75 )
78if __name__ == "__main__":
79 daily_prices = pd.read_parquet(os.path.join(os.path.dirname(os.getcwd()), "financial_data/daily_price.parquet"))
81 # Select just some indices
82 subset_data = daily_prices[(daily_prices.index > "2013-01-01") & (daily_prices.index < "2024-07-28")]
84 # Clean data and save for the investment funnel app
85 clean_data(data_raw=subset_data)