Coverage for src/ifunnel/models/Clustering.py: 0%

1import matplotlib.pyplot as plt

2import pandas as pd

3from loguru import logger

4from scipy.cluster.hierarchy import complete, dendrogram, fcluster

5from scipy.spatial.distance import squareform

def fancy_dendrogram(*args, **kwargs):
    """
    Draw a SciPy dendrogram and annotate its merge heights.

    Thin wrapper around ``scipy.cluster.hierarchy.dendrogram``: every
    positional and keyword argument is forwarded unchanged, except for two
    extra keywords that are consumed here.

    max_d: optional cut-off distance. When truthy it is drawn as a horizontal
        black line and, unless the caller passed ``color_threshold``
        explicitly, it is also used as the colour threshold.
    annotate_above: only merges whose height is strictly greater than this
        value get a marker and a height label (default 0).

    Nothing is decorated when ``no_plot=True`` is passed. Otherwise the
    decorations are drawn on the axes given through ``ax`` or, when no axes
    were given, on the current pyplot axes.

    Returns the dictionary produced by ``dendrogram``.
    """
    max_d = kwargs.pop("max_d", None)
    if max_d and "color_threshold" not in kwargs:
        kwargs["color_threshold"] = max_d
    annotate_above = kwargs.pop("annotate_above", 0)

    d_data = dendrogram(*args, **kwargs)

    if kwargs.get("no_plot", False):
        return d_data

    # Decorate the same axes the dendrogram was drawn on; relying on the
    # pyplot "current axes" would miss an explicitly supplied ``ax``.
    ax = kwargs.get("ax")
    if ax is None:
        ax = plt.gca()

    ax.set_title("Hierarchical Clustering Dendrogram (truncated)")
    ax.set_xlabel("sample index or (cluster size)")
    ax.set_ylabel("distance")

    for xs, ys, color in zip(d_data["icoord"], d_data["dcoord"], d_data["color_list"]):
        # Each link is a four-point "U"; points 1 and 2 form its horizontal
        # bar, so their midpoint/height locate the merge itself.
        x = 0.5 * sum(xs[1:3])
        y = ys[1]
        if y > annotate_above:
            ax.plot(x, y, "o", c=color)
            ax.annotate(
                f"{y:.3g}",
                (x, y),
                xytext=(0, -5),
                textcoords="offset points",
                va="top",
                ha="center",
            )

    if max_d:
        ax.axhline(y=max_d, c="k")
    return d_data

def cluster(data: pd.DataFrame, n_clusters: int, dendrogram: bool = False) -> pd.DataFrame:
    """
    Group the columns of ``data`` with complete-linkage hierarchical clustering.

    The distance between two columns is ``1 - Spearman correlation``.

    data: frame whose columns are the items to cluster (labelled "Assets"
        on the optional plot).
    n_clusters: maximum number of flat clusters, passed to ``fcluster`` with
        ``criterion="maxclust"``.
    dendrogram: when True, a dendrogram figure is drawn and shown before the
        result is returned.

    Returns a frame indexed by the column labels of ``data`` with two columns:
    ``"Complete_Corr"`` (the integer cluster id from ``fcluster``) and
    ``"Cluster"`` (the same id as the text ``"Cluster <id>"``).
    """
    logger.info("💡 Running hierarchical clustering method")

    # Spearman correlation between columns, turned into a distance
    distance_corr = 1 - data.corr(method="spearman")

    # The linkage routine works on the condensed form of the square matrix
    condensed_distance = squareform(distance_corr)
    linkage_matrix = complete(condensed_distance)  # complete-linkage clustering

    if dendrogram:
        plt.figure(figsize=(25, 10))
        fancy_dendrogram(
            linkage_matrix,
            leaf_rotation=90.0,  # rotates the x-axis labels
            leaf_font_size=8.0,  # font size for the x-axis labels
            color_threshold=0.7,
            labels=distance_corr.index,
            # distances are 1 - correlation and so never exceed 2; a threshold
            # of 10 therefore leaves every merge un-annotated
            annotate_above=10,
        )
        plt.title(
            "Hierarchical Clustering Dendrogram: Complete Linkage, Spearman Correlation Distance Mearsure",
            fontsize=16,
        )
        plt.xlabel("Assets", fontsize=16)
        plt.ylabel("Distance", fontsize=16)
        plt.show()

    # Store the flat cluster assignment, one row per column of ``data``
    cluster_df = pd.DataFrame(index=distance_corr.index)
    cluster_df["Complete_Corr"] = fcluster(linkage_matrix, n_clusters, criterion="maxclust")

    # Text label used for plotting; built column-wise so it is also correct
    # when index labels repeat (row-wise .loc lookups are ambiguous there)
    cluster_df["Cluster"] = "Cluster " + cluster_df["Complete_Corr"].astype(str)

    return cluster_df

def pick_cluster(data: pd.DataFrame, stat: pd.DataFrame, ml: pd.DataFrame, n_assets: int) -> tuple[list, pd.DataFrame]:
    """
    Pick the top assets of every cluster by Sharpe ratio.

    data: frame from which the columns of the selected assets are returned.
    stat: frame providing the ``"Sharpe Ratio"`` column.
    ml: frame providing the ``"Cluster"`` column.
        ``stat`` and ``ml`` are joined side by side on their index.
    n_assets: number of assets to take from each cluster. A cluster with
        fewer members contributes all of them and a warning is logged.

    Returns ``(ids, result)``: the selected index labels, cluster by cluster
    in order of first appearance and by descending Sharpe ratio within a
    cluster, and ``data`` restricted to those columns.
    """
    combined = pd.concat([stat, ml], axis=1)

    ids = []
    for clus in combined["Cluster"].unique():
        # Filter once per cluster; labels are compared as text
        members = combined[combined["Cluster"] == str(clus)]
        max_size = len(members)

        # Never ask for more rows than the cluster holds
        n_pick = min(n_assets, max_size)
        ids.extend(members.nlargest(n_pick, ["Sharpe Ratio"]).index)

        if n_assets > max_size:
            logger.warning(f"⚠️ In {clus} was picked only {max_size} assets")

    # Get returns
    result = data[ids]

    return ids, result