Coverage for src/ifunnel/models/Clustering.py: 0%

53 statements  

« prev     ^ index     » next       coverage.py v7.9.2, created at 2025-07-12 09:14 +0000

1import matplotlib.pyplot as plt 

2import pandas as pd 

3from loguru import logger 

4from scipy.cluster.hierarchy import complete, dendrogram, fcluster 

5from scipy.spatial.distance import squareform 

6 

7 

def fancy_dendrogram(*args, **kwargs):
    """
    Draw a SciPy dendrogram and annotate the height of its merges.

    Every positional and keyword argument is forwarded unchanged to
    ``scipy.cluster.hierarchy.dendrogram`` except the two extras listed
    below, which are consumed by this wrapper.

    Extra keyword arguments:
        max_d: Optional cut-off distance. When given, a horizontal black line
            is drawn at that height and, unless ``color_threshold`` was passed
            explicitly, the value is also used as the colouring threshold.
        annotate_above: Only merges whose height is strictly greater than this
            value get a marker and a text label (default ``0``).

    Returns:
        The dictionary produced by ``scipy.cluster.hierarchy.dendrogram``.
    """
    max_d = kwargs.pop("max_d", None)
    annotate_above = kwargs.pop("annotate_above", 0)

    # Compare against None so that an explicit threshold of 0 is not ignored.
    if max_d is not None and "color_threshold" not in kwargs:
        kwargs["color_threshold"] = max_d

    d_data = dendrogram(*args, **kwargs)

    # Nothing was drawn, so there is nothing to decorate.
    if kwargs.get("no_plot", False):
        return d_data

    # Decorate the axes the dendrogram was drawn on: the caller-supplied
    # ``ax`` if there is one, otherwise the current pyplot axes.
    ax = kwargs.get("ax")
    if ax is None:
        ax = plt.gca()

    ax.set_title("Hierarchical Clustering Dendrogram (truncated)")
    ax.set_xlabel("sample index or (cluster size)")
    ax.set_ylabel("distance")

    for xs, ys, colour in zip(d_data["icoord"], d_data["dcoord"], d_data["color_list"]):
        # Each link is drawn as a "U" through four points; the two middle
        # points form the horizontal bar, so its midpoint/height mark the merge.
        x = 0.5 * sum(xs[1:3])
        y = ys[1]
        if y > annotate_above:
            ax.plot(x, y, "o", c=colour)
            ax.annotate(
                f"{y:.3g}",
                (x, y),
                xytext=(0, -5),
                textcoords="offset points",
                va="top",
                ha="center",
            )

    if max_d is not None:
        ax.axhline(y=max_d, c="k")

    return d_data

39 

40 

def cluster(data: pd.DataFrame, n_clusters: int, dendrogram: bool = False) -> pd.DataFrame:
    """
    Group the columns of ``data`` with complete-linkage hierarchical clustering.

    The distance between two columns is ``1 - Spearman correlation``.

    Args:
        data: Table whose columns are the series to cluster.
        n_clusters: Maximum number of flat clusters to form (passed to
            ``fcluster`` with ``criterion="maxclust"``).
        dendrogram: When true, also draw and show the dendrogram. Inside this
            function the name shadows the imported SciPy ``dendrogram``; that
            is harmless because ``fancy_dendrogram`` resolves the module-level
            name.

    Returns:
        A DataFrame indexed like the correlation matrix with two columns:
        ``"Complete_Corr"`` (integer cluster id) and ``"Cluster"`` (the label
        ``"Cluster <id>"`` used for plotting).
    """
    logger.info("💡 Running hierarchical clustering method")

    corr = data.corr(method="spearman")  # Spearman rank correlation
    distance_corr = 1 - corr  # correlation-based distance, in [0, 2]

    # Condense the square distance matrix into the vector form that the
    # linkage routines expect. The validity checks are skipped because they
    # demand an *exactly* zero diagonal and exact symmetry, which floating
    # point round-off in the correlation can violate; only the upper
    # triangle is read, and the matrix is symmetric by construction.
    con_distance_corr = squareform(distance_corr.to_numpy(), checks=False)
    complete_corr = complete(con_distance_corr)  # complete-linkage clustering

    if dendrogram:
        # Draw the dendrogram
        plt.figure(figsize=(25, 10))
        fancy_dendrogram(
            complete_corr,
            leaf_rotation=90.0,  # rotates the x-axis labels
            leaf_font_size=8.0,  # font size for the x-axis labels
            color_threshold=0.7,
            labels=distance_corr.index,
            # Distances never exceed 2, so this threshold effectively
            # suppresses the per-merge annotations.
            annotate_above=10,
        )
        plt.title(
            "Hierarchical Clustering Dendrogram: Complete Linkage, Spearman Correlation Distance Mearsure",
            fontsize=16,
        )
        plt.xlabel("Assets", fontsize=16)
        plt.ylabel("Distance", fontsize=16)
        plt.show()

    # Store the flat cluster assignment, one row per clustered column
    cluster_df = pd.DataFrame(index=distance_corr.index)
    cluster_df["Complete_Corr"] = fcluster(complete_corr, n_clusters, criterion="maxclust")

    # Human-readable label column for plotting, built in one vectorised step
    cluster_df["Cluster"] = "Cluster " + cluster_df["Complete_Corr"].astype(str)

    return cluster_df

86 

87 

def pick_cluster(data: pd.DataFrame, stat: pd.DataFrame, ml: pd.DataFrame, n_assets: int) -> tuple[list, pd.DataFrame]:
    """
    Pick the best assets of every cluster, ranked by Sharpe ratio.

    Args:
        data: Table with one column per asset; the selected columns are
            returned from it.
        stat: Per-asset statistics; must provide a ``"Sharpe Ratio"`` column
            (unless ``ml`` does).
        ml: Per-asset clustering result; must provide a ``"Cluster"`` column
            (unless ``stat`` does).
        n_assets: Number of assets to take from each cluster. Clusters with
            fewer members contribute all of them and a warning is logged.

    Returns:
        A tuple ``(ids, result)`` where ``ids`` is the list of selected asset
        identifiers (cluster by cluster, best first) and ``result`` is
        ``data`` restricted to those columns.
    """
    # Put statistics and cluster labels side by side, aligned on the index
    merged = pd.concat([stat, ml], axis=1)

    ids = []
    for clus in merged["Cluster"].unique():
        # Filter once per cluster instead of rebuilding the mask for each use
        members = merged[merged["Cluster"] == str(clus)]
        max_size = len(members)

        # Never ask for more rows than the cluster holds
        best = members.nlargest(min(n_assets, max_size), ["Sharpe Ratio"])
        ids.extend(best.index)

        if n_assets > max_size:
            logger.warning(f"⚠️ In {clus} was picked only {max_size} assets")

    # Get returns of the selected assets
    result = data[ids]

    return ids, result