Coverage for src/ifunnel/models/Clustering.py: 0%
53 statements
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-12 09:14 +0000
« prev ^ index » next coverage.py v7.9.2, created at 2025-07-12 09:14 +0000
1import matplotlib.pyplot as plt
2import pandas as pd
3from loguru import logger
4from scipy.cluster.hierarchy import complete, dendrogram, fcluster
5from scipy.spatial.distance import squareform
def fancy_dendrogram(*args, **kwargs):
    """
    Draw a SciPy dendrogram and annotate its merge heights.

    Every positional and keyword argument is forwarded unchanged to
    ``scipy.cluster.hierarchy.dendrogram``, except for two extra keywords
    that are consumed here:

    max_d
        Optional cut-off distance. When truthy it is drawn as a horizontal
        black line and, unless ``color_threshold`` was passed explicitly,
        it is also used as the colour threshold.
    annotate_above
        Only merges whose height is strictly greater than this value get a
        marker and a text label (default ``0``).

    If ``no_plot=True`` is passed, nothing is drawn by this function.
    If an ``ax`` keyword is passed, the decorations are drawn on that axes;
    otherwise the current pyplot axes is used.

    Returns the dictionary produced by ``dendrogram``.
    """
    max_d = kwargs.pop("max_d", None)
    if max_d and "color_threshold" not in kwargs:
        kwargs["color_threshold"] = max_d
    annotate_above = kwargs.pop("annotate_above", 0)

    d_data = dendrogram(*args, **kwargs)

    if kwargs.get("no_plot", False):
        return d_data

    # Decorate the same axes the tree was drawn on. Looking the axes up only
    # after the dendrogram call keeps the fallback identical to the previous
    # behaviour of the pyplot-level helpers (title/xlabel/plot/...).
    ax = kwargs.get("ax")
    if ax is None:
        ax = plt.gca()

    ax.set_title("Hierarchical Clustering Dendrogram (truncated)")
    ax.set_xlabel("sample index or (cluster size)")
    ax.set_ylabel("distance")

    for xs, ys, colour in zip(d_data["icoord"], d_data["dcoord"], d_data["color_list"]):
        # Each link is a four-point path; points 1 and 2 form its top bar.
        x = 0.5 * sum(xs[1:3])
        y = ys[1]
        if y > annotate_above:
            ax.plot(x, y, "o", c=colour)
            ax.annotate(
                "%.3g" % y,
                (x, y),
                xytext=(0, -5),
                textcoords="offset points",
                va="top",
                ha="center",
            )

    if max_d:
        ax.axhline(y=max_d, c="k")
    return d_data
def cluster(data: pd.DataFrame, n_clusters: int, dendrogram: bool = False) -> pd.DataFrame:
    """
    Group the columns of ``data`` by complete-linkage hierarchical clustering.

    The distance between two columns is ``1 - Spearman correlation``.

    Parameters
    ----------
    data
        Frame whose columns are the items to cluster.
    n_clusters
        Maximum number of flat clusters to form (``fcluster`` with
        ``criterion="maxclust"``).
    dendrogram
        When true, plot the dendrogram with matplotlib and call ``plt.show()``.
        Note that inside this function the name shadows the SciPy
        ``dendrogram`` import; only ``fancy_dendrogram`` is called here.

    Returns
    -------
    pd.DataFrame
        Indexed like the correlation matrix (one row per column of ``data``)
        with two columns: ``"Complete_Corr"`` (integer cluster id) and
        ``"Cluster"`` (the label ``"Cluster <id>"``, used for plotting).
    """
    logger.info("💡 Running hierarchical clustering method")

    corr = data.corr(method="spearman")  # Spearman rank correlation between columns
    distance_corr = 1 - corr  # correlation distance

    # Condensed (vector) form of the Spearman distance matrix, as required
    # by the SciPy linkage functions.
    con_distance_corr = squareform(distance_corr)
    # Hierarchical clustering with complete linkage.
    complete_corr = complete(con_distance_corr)

    if dendrogram:
        # draw the dendrogram
        plt.figure(figsize=(25, 10))
        fancy_dendrogram(
            complete_corr,
            leaf_rotation=90.0,  # rotates the x-axis labels
            leaf_font_size=8.0,  # font size for the x-axis labels
            color_threshold=0.7,  # colour threshold passed on to dendrogram
            labels=distance_corr.index,
            # max_d=0.35,
            # 1 - correlation never exceeds 2, so a threshold of 10 means no
            # merge height is annotated.
            annotate_above=10,
        )
        plt.title(
            "Hierarchical Clustering Dendrogram: Complete Linkage, Spearman Correlation Distance Mearsure",
            fontsize=16,
        )
        plt.xlabel("Assets", fontsize=16)
        plt.ylabel("Distance", fontsize=16)
        plt.show()

    # Save the clustering into a dataframe, one row per clustered column.
    cluster_df = pd.DataFrame(index=distance_corr.index)

    # Flat cluster ids obtained by cutting the tree into at most n_clusters groups.
    cluster_df["Complete_Corr"] = fcluster(complete_corr, n_clusters, criterion="maxclust")

    # Label column for plotting. Built in one vectorized step so it is also
    # correct when the index contains duplicate labels.
    cluster_df["Cluster"] = "Cluster " + cluster_df["Complete_Corr"].astype(str)

    return cluster_df
def pick_cluster(data: pd.DataFrame, stat: pd.DataFrame, ml: pd.DataFrame, n_assets: int) -> tuple[list, pd.DataFrame]:
    """
    Pick the best assets from every cluster, ranked by Sharpe ratio.

    ``stat`` and ``ml`` are concatenated column-wise; the combined frame must
    provide a ``"Sharpe Ratio"`` column and a ``"Cluster"`` column.

    Parameters
    ----------
    data
        Frame with one column per asset; the selected columns are returned.
    stat
        Frame concatenated with ``ml`` (row index = asset identifiers).
    ml
        Frame concatenated with ``stat`` (row index = asset identifiers).
    n_assets
        Number of assets to take from each cluster. A cluster with fewer
        members contributes all of them and a warning is logged.

    Returns
    -------
    tuple[list, pd.DataFrame]
        The selected asset identifiers (in order of cluster appearance, best
        Sharpe ratio first within a cluster) and ``data`` restricted to them.
    """
    merged = pd.concat([stat, ml], axis=1)

    # For each cluster keep the assets with the highest Sharpe ratio.
    ids = []
    for clus in merged["Cluster"].unique():
        # Compare against the label itself (not its string form) so that
        # non-string cluster labels are matched as well.
        members = merged[merged["Cluster"] == clus]
        max_size = len(members)

        ids.extend(members.nlargest(min(n_assets, max_size), ["Sharpe Ratio"]).index)
        if n_assets > max_size:
            logger.warning(f"⚠️ In {clus} was picked only {max_size} assets")

    # Get returns
    result = data[ids]

    return ids, result