Coverage for src/jsharpe/sharpe/generators.py: 100%

55 statements  

« prev     ^ index     » next       coverage.py v7.14.3, created at 2026-06-29 13:57 +0000

1"""Synthetic return-data generators and autocorrelation estimation. 

2 

3This module groups the simulation helpers used to generate 

4(autocorrelated) non-Gaussian return series and block-structured random 

5correlation matrices, plus the mean first-order autocorrelation estimator. 

6""" 

7# ruff: noqa: N802, N803, N806, S101 

8 

9import numpy as np 

10import scipy 

11 

12from .linalg import ppoints 

13 

14 

15def generate_autocorrelated_non_gaussian_data( 

16 N: int, 

17 n: int, 

18 SR0: float = 0, 

19 name: str = "gaussian", 

20 rho: float | None = None, 

21 gaussian_autocorrelation: float = 0, 

22) -> np.ndarray: 

23 """Generate autocorrelated non-Gaussian return data for simulation. 

24 

25 Creates a matrix of simulated returns with specified autocorrelation 

26 and marginal distribution characteristics (skewness/kurtosis). 

27 

28 Uses a copula-like approach: 

29 1. Generate AR(1) Gaussian processes 

30 2. Transform to uniform via Gaussian CDF 

31 3. Transform to target marginals via inverse CDF 

32 

33 Args: 

34 N: Number of time periods (rows). 

35 n: Number of assets/strategies (columns). 

36 SR0: Target Sharpe ratio. Default 0. 

37 name: Distribution type. One of "gaussian", "mild", "moderate", 

38 "severe". Default "gaussian". 

39 rho: Autocorrelation coefficient. If None, uses gaussian_autocorrelation. 

40 gaussian_autocorrelation: Autocorrelation for Gaussian case. Default 0. 

41 

42 Returns: 

43 Array of shape (N, n) containing simulated returns. 

44 

45 Example: 

46 >>> np.random.seed(42) 

47 >>> X = generate_autocorrelated_non_gaussian_data(100, 2, SR0=0.1, name="mild") 

48 >>> X.shape 

49 (100, 2) 

50 """ 

51 if rho is None: 

52 # With the distributions we consider the autocorrelation is almost the same. 

53 rho = gaussian_autocorrelation 

54 

55 shape = (N, n) 

56 

57 # Marginal distribution: ppf 

58 R = 10_000 

59 marginal = generate_non_gaussian_data(R, 1, SR0=SR0, name=name)[:, 0] 

60 ppf = scipy.interpolate.interp1d(ppoints(R), sorted(marginal), fill_value="extrapolate") 

61 

62 # AR(1) processes 

63 X = np.random.normal(size=shape) 

64 for i in range(1, shape[0]): 

65 X[i, :] = rho * X[i - 1, :] + np.sqrt(1 - rho**2) * X[i, :] 

66 

67 # Convert the margins to uniform, with the Gaussian cdf 

68 X = scipy.stats.norm.cdf(X) 

69 

70 # Convert the uniforms to the target margins, using the ppf 

71 result: np.ndarray = ppf(X) 

72 

73 return result 

74 

75 

def get_random_correlation_matrix(
    number_of_trials: int = 100,
    effective_number_of_trials: int = 10,
    number_of_observations: int = 200,
    noise: float = 0.1,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Generate a random correlation matrix with block structure.

    Creates a correlation matrix representing clustered strategies, where
    strategies within the same cluster are highly correlated and strategies
    across clusters have lower correlation.

    Each strategy is its cluster's Gaussian factor plus independent Gaussian
    noise scaled by ``noise``; the returned matrix is the sample correlation
    of those simulated series.

    Args:
        number_of_trials: Number of time series (strategies). Default 100.
        effective_number_of_trials: Number of clusters. Must satisfy
            ``1 <= effective_number_of_trials <= number_of_trials``. Default 10.
        number_of_observations: Number of time periods to simulate. Default 200.
        noise: Noise level added to each series. Default 0.1.

    Returns:
        Tuple of (C, X, clusters):
            - C: Correlation matrix of shape (number_of_trials, number_of_trials)
            - X: Data matrix of shape (number_of_observations, number_of_trials)
            - clusters: Cluster assignment for each strategy

    Raises:
        ValueError: If the number of clusters is smaller than one or larger
            than the number of strategies (no partition into non-empty
            blocks exists).

    Example:
        >>> np.random.seed(42)
        >>> C, X, clusters = get_random_correlation_matrix(
        ...     number_of_trials=20, effective_number_of_trials=4
        ... )
        >>> C.shape
        (20, 20)
        >>> np.allclose(np.diag(C), 1)  # Diagonal is all ones
        True
    """
    # Without this guard the rejection loop below can never succeed when there
    # are more clusters than strategies, and it would spin forever.
    if not 1 <= effective_number_of_trials <= number_of_trials:
        raise ValueError(
            "effective_number_of_trials must be between 1 and number_of_trials "
            f"({number_of_trials}), got {effective_number_of_trials}"
        )

    # Rejection sampling of the block boundaries: draw the interior cut points
    # and retry until every block is non-empty.
    while True:
        block_positions = [
            0,
            *sorted(np.random.choice(number_of_trials, effective_number_of_trials - 1, replace=True)),
            number_of_trials,
        ]
        block_sizes = np.diff(block_positions)
        if np.all(block_sizes > 0):
            break

    # Cluster label of every strategy: label k repeated block_sizes[k] times.
    clusters = np.repeat(np.arange(effective_number_of_trials), block_sizes)

    # One latent factor per cluster; each strategy is its factor plus own noise.
    X0 = np.random.normal(size=(number_of_observations, effective_number_of_trials))
    X = np.zeros(shape=(number_of_observations, number_of_trials))
    for i, cluster in enumerate(clusters):
        X[:, i] = X0[:, cluster] + noise * np.random.normal(size=number_of_observations)

    # np.corrcoef collapses to a scalar for a single series; keep C 2-D so the
    # diagonal fix-up below works for number_of_trials == 1 as well.
    C = np.atleast_2d(np.corrcoef(X, rowvar=False))
    np.fill_diagonal(C, 1)  # rounding errors
    C = np.clip(C, -1, 1)
    return C, X, clusters

129 

130 

def generate_non_gaussian_data(
    nr: int,
    nc: int,
    *,
    SR0: float = 0,
    name: str = "severe",
) -> np.ndarray:
    """Generate non-Gaussian return data with specified characteristics.

    Creates a matrix of simulated returns from a two-component Gaussian
    mixture (a "core" component and a rarer "tail" component) that exhibits
    the specified skewness and kurtosis characteristics while maintaining
    the target Sharpe ratio.

    Args:
        nr: Number of rows (observations/time periods).
        nc: Number of columns (assets/strategies).
        SR0: Target Sharpe ratio. Default 0.
        name: Distribution severity. One of:
            - "gaussian": No skewness or kurtosis
            - "mild": Slight negative skew and excess kurtosis
            - "moderate": Moderate negative skew and excess kurtosis
            - "severe": Strong negative skew and excess kurtosis
            Default "severe".

    Returns:
        Array of shape (nr, nc) containing simulated returns.

    Raises:
        AssertionError: If name is not a valid distribution type.

    Example:
        >>> np.random.seed(42)
        >>> X = generate_non_gaussian_data(1000, 1, SR0=0.2, name="mild")
        >>> X.shape
        (1000, 1)
    """
    # Mixture parameters per severity: (p_tail, mu_tail, sigma_tail, sigma_core).
    configs = {
        "gaussian": (0, 0, 0.015, 0.010),
        "mild": (0.04, -0.03, 0.015, 0.010),
        "moderate": (0.03, -0.045, 0.020, 0.010),
        "severe": (0.02, -0.060, 0.025, 0.010),
    }
    if name not in configs:
        # Raised explicitly (instead of a bare `assert`) so the check is not
        # stripped under `python -O`; the exception type stays AssertionError
        # for backward compatibility with existing callers.
        raise AssertionError(f"unknown distribution name {name!r}; expected one of {sorted(configs)}")
    p_tail, mu_tail, sigma_tail, sigma_core = configs[name]

    # Zero-mean baseline mixture (choose mu_core so mean=0)
    mu_core0 = -p_tail * mu_tail / (1.0 - p_tail)

    # Variance of the baseline mixture: E[X^2] - E[X]^2.
    w_core = 1.0 - p_tail
    mean0 = w_core * mu_core0 + p_tail * mu_tail
    second_moment = w_core * (sigma_core**2 + mu_core0**2) + p_tail * (sigma_tail**2 + mu_tail**2)
    std0 = np.sqrt(float(second_moment - mean0**2))

    # Shifting both components by the same amount sets the population Sharpe
    # ratio to SR0 while preserving skewness and kurtosis.
    mu_shift = SR0 * std0

    shape = (nr, nc)
    # Draw order (tail mask, core draws, tail draws) is kept fixed so that
    # seeded runs stay reproducible.
    tail_mask = np.random.uniform(size=shape) < p_tail
    X = np.random.normal(mu_core0 + mu_shift, sigma_core, size=shape)
    X[tail_mask] = np.random.normal(mu_tail + mu_shift, sigma_tail, size=tail_mask.sum())
    return X

222 

223 

def autocorrelation(X: np.ndarray) -> float:
    """Return the average lag-1 autocorrelation of the columns of ``X``.

    For every column, the series shifted by one step is correlated with the
    series itself; the per-column coefficients are then averaged.

    Args:
        X: Data matrix of shape (n_observations, n_series).

    Returns:
        Mean autocorrelation coefficient across all columns.

    Example:
        >>> np.random.seed(42)
        >>> # Generate AR(1) process with rho=0.5
        >>> n = 1000
        >>> X = np.zeros((n, 1))
        >>> X[0] = np.random.normal()
        >>> for i in range(1, n):
        ...     X[i] = 0.5 * X[i-1] + np.sqrt(1-0.25) * np.random.normal()
        >>> ac = autocorrelation(X)
        >>> bool(0.4 < ac < 0.6)  # Should be close to 0.5
        True
    """
    _, n_series = X.shape  # unpacking enforces a two-dimensional input
    coefficients = np.empty(n_series)
    # Transposed views iterate column by column: current values vs. their predecessors.
    for j, (current, previous) in enumerate(zip(X[1:].T, X[:-1].T)):
        coefficients[j] = np.corrcoef(current, previous)[0, 1]
    return float(coefficients.mean())

252 return float(ac.mean())