Coverage for src/ifunnel/financial_data_preprocessing/clean_downloaded_data.py: 0%

49 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-12 09:14 +0000

1import os 

2 

3import pandas as pd 

4 

5 

6def clean_data(data_raw): 

7 """ 

8 Function to clean raw data from Yahoo! finance and transform it into weekly returns 

9 """ 

10 data_raw = data_raw.fillna("") 

11 

12 # Delete tickers for which we don't have data for the whole time period 

13 for column in data_raw.columns: 

14 # if the first three or last three values of the column are not empty, then delete the column 

15 if not data_raw[column].values[:3].all() or not data_raw[column].values[-3:].all(): 

16 data_raw.drop(column, axis=1, inplace=True) 

17 

18 # Fill missing daily prices with the closest available price in the future 

19 data = data_raw.copy() 

20 for asset in data_raw.columns: 

21 for indx, date in enumerate(data_raw.index): 

22 if not data_raw.loc[date, asset]: 

23 for date_future in list(data_raw.index)[indx:]: 

24 if data_raw.loc[date_future, asset]: 

25 data.loc[date, asset] = data_raw.loc[date_future, asset] 

26 print("found price") 

27 break 

28 else: 

29 continue 

30 

31 # Delete tickers (outliers) with daily returns bigger that 20% 

32 to_delete = [] 

33 for asset in data.columns: 

34 column = list(data[asset]) 

35 value_old = column[0] 

36 for value in column[1:]: 

37 if abs((value / value_old) - 1) > 0.20: 

38 to_delete.append(asset) 

39 print(asset, (value / value_old) - 1, len(to_delete)) 

40 break 

41 else: 

42 value_old = value 

43 for delete_col in to_delete: 

44 data.drop(delete_col, axis=1, inplace=True) 

45 

46 # Select only Wednesdays to be able to compute monthly returns 

47 data_wed = data[data.index.weekday == 2] 

48 

49 # Check if we have all Wednesdays' prices, if not fill it with the price 5 days in the past 

50 date_test = data_wed.index[0] 

51 date_list = data_wed.index.to_list() 

52 while date_test < date_list[-1]: 

53 date_test = date_test + pd.Timedelta(days=7) 

54 if date_test not in data_wed.index: 

55 print(date_test) 

56 data_wed.loc[date_test] = data.loc[date_test - pd.Timedelta(days=5)].to_list() 

57 

58 # Sort df by index 

59 data_wed = data_wed.sort_index() 

60 

61 # Create dataframes with returns instead of prices 

62 data_wed_rets = data_wed.copy() 

63 for asset in data_wed.columns: 

64 data_wed_rets[asset] = data_wed[asset].pct_change() 

65 

66 # drop the first row, because it contains NaNs 

67 data_wed_rets = data_wed_rets.drop(data_wed_rets.index[0]) 

68 

69 wanted_columns = [col for col in data_wed_rets.columns if col[0] != "nan"] 

70 data_wed_rets = data_wed_rets[wanted_columns] 

71 # Save results with returns into data folder for the app 

72 data_wed_rets.to_parquet( 

73 os.path.join(os.path.dirname(os.getcwd()), "financial_data/all_etfs_rets.parquet.gzip"), 

74 compression="gzip", 

75 ) 

76 

77 

78if __name__ == "__main__": 

79 daily_prices = pd.read_parquet(os.path.join(os.path.dirname(os.getcwd()), "financial_data/daily_price.parquet")) 

80 

81 # Select just some indices 

82 subset_data = daily_prices[(daily_prices.index > "2013-01-01") & (daily_prices.index < "2024-07-28")] 

83 

84 # Clean data and save for the investment funnel app 

85 clean_data(data_raw=subset_data)