Coverage for src/jsharpe/sharpe/generators.py: 100%

1"""Synthetic return-data generators and autocorrelation estimation.

3This module groups the simulation helpers used to generate

4(autocorrelated) non-Gaussian return series and block-structured random

5correlation matrices, plus the mean first-order autocorrelation estimator.

6"""

7# ruff: noqa: N802, N803, N806, S101

9import numpy as np

10import scipy

12from .linalg import ppoints

15def generate_autocorrelated_non_gaussian_data(

16 N: int,

17 n: int,

18 SR0: float = 0,

19 name: str = "gaussian",

20 rho: float | None = None,

21 gaussian_autocorrelation: float = 0,

22) -> np.ndarray:

23 """Generate autocorrelated non-Gaussian return data for simulation.

25 Creates a matrix of simulated returns with specified autocorrelation

26 and marginal distribution characteristics (skewness/kurtosis).

28 Uses a copula-like approach:

29 1. Generate AR(1) Gaussian processes

30 2. Transform to uniform via Gaussian CDF

31 3. Transform to target marginals via inverse CDF

33 Args:

34 N: Number of time periods (rows).

35 n: Number of assets/strategies (columns).

36 SR0: Target Sharpe ratio. Default 0.

37 name: Distribution type. One of "gaussian", "mild", "moderate",

38 "severe". Default "gaussian".

39 rho: Autocorrelation coefficient. If None, uses gaussian_autocorrelation.

40 gaussian_autocorrelation: Autocorrelation for Gaussian case. Default 0.

42 Returns:

43 Array of shape (N, n) containing simulated returns.

45 Example:

46 >>> np.random.seed(42)

47 >>> X = generate_autocorrelated_non_gaussian_data(100, 2, SR0=0.1, name="mild")

48 >>> X.shape

49 (100, 2)

50 """

51 if rho is None:

52 # With the distributions we consider the autocorrelation is almost the same.

53 rho = gaussian_autocorrelation

55 shape = (N, n)

57 # Marginal distribution: ppf

58 R = 10_000

59 marginal = generate_non_gaussian_data(R, 1, SR0=SR0, name=name)[:, 0]

60 ppf = scipy.interpolate.interp1d(ppoints(R), sorted(marginal), fill_value="extrapolate")

62 # AR(1) processes

63 X = np.random.normal(size=shape)

64 for i in range(1, shape[0]):

65 X[i, :] = rho * X[i - 1, :] + np.sqrt(1 - rho**2) * X[i, :]

67 # Convert the margins to uniform, with the Gaussian cdf

68 X = scipy.stats.norm.cdf(X)

70 # Convert the uniforms to the target margins, using the ppf

71 result: np.ndarray = ppf(X)

73 return result

def get_random_correlation_matrix(
    number_of_trials: int = 100,
    effective_number_of_trials: int = 10,
    number_of_observations: int = 200,
    noise: float = 0.1,
) -> tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Generate a random correlation matrix with block structure.

    Creates a correlation matrix representing clustered strategies, where
    strategies within the same cluster are highly correlated and strategies
    across clusters have lower correlation.

    Each cluster is driven by one standard-normal factor; every strategy is
    its cluster's factor plus independent Gaussian noise scaled by ``noise``.
    Cluster boundaries are drawn by rejection sampling: random cut points are
    redrawn until every cluster contains at least one strategy.

    Args:
        number_of_trials: Number of time series (strategies). Default 100.
        effective_number_of_trials: Number of clusters. Must satisfy
            ``1 <= effective_number_of_trials <= number_of_trials``. Default 10.
        number_of_observations: Number of time periods to simulate. Default 200.
        noise: Noise level added to each series. Default 0.1.

    Returns:
        Tuple of (C, X, clusters):
        - C: Correlation matrix of shape (number_of_trials, number_of_trials)
        - X: Data matrix of shape (number_of_observations, number_of_trials)
        - clusters: Cluster assignment for each strategy

    Raises:
        ValueError: If the number of clusters is not between 1 and
            ``number_of_trials``. Without this check the rejection loop
            below could never produce all-positive block sizes and would
            spin forever.

    Example:
        >>> np.random.seed(42)
        >>> C, X, clusters = get_random_correlation_matrix(
        ...     number_of_trials=20, effective_number_of_trials=4
        ... )
        >>> C.shape
        (20, 20)
        >>> np.allclose(np.diag(C), 1)  # Diagonal is all ones
        True
    """
    if not 1 <= effective_number_of_trials <= number_of_trials:
        raise ValueError(
            "effective_number_of_trials must be between 1 and number_of_trials "
            f"(got {effective_number_of_trials!r} clusters for {number_of_trials!r} trials)"
        )

    # Rejection sampling of the cut points; kept as-is (rather than sampling
    # without replacement) so that seeded runs draw the same random numbers.
    while True:
        block_positions = [
            0,
            *sorted(np.random.choice(number_of_trials, effective_number_of_trials - 1, replace=True)),
            number_of_trials,
        ]
        block_sizes = np.diff(block_positions)
        if np.all(block_sizes > 0):
            break

    # Label of the cluster each strategy belongs to: 0,0,...,1,1,...,k-1.
    clusters = np.repeat(np.arange(effective_number_of_trials), block_sizes)

    # One latent factor per cluster, then one noisy copy per strategy.
    # The noise is drawn column by column to keep the random stream stable.
    X0 = np.random.normal(size=(number_of_observations, effective_number_of_trials))
    X = np.zeros(shape=(number_of_observations, number_of_trials))
    for i, cluster in enumerate(clusters):
        X[:, i] = X0[:, cluster] + noise * np.random.normal(size=number_of_observations)

    C = np.asarray(np.corrcoef(X, rowvar=False))
    np.fill_diagonal(C, 1)  # rounding errors
    C = np.clip(C, -1, 1)
    return C, X, clusters

129

130

def generate_non_gaussian_data(
    nr: int,
    nc: int,
    *,
    SR0: float = 0,
    name: str = "severe",
) -> np.ndarray:
    """Draw i.i.d. returns from a two-component Gaussian mixture.

    Every entry comes either from a "core" normal or, with a small
    probability, from a "tail" normal with its own mean and standard
    deviation. The core mean is chosen so that the unshifted mixture has
    zero mean; both components are then shifted by ``SR0`` times the
    mixture's standard deviation, so the population Sharpe ratio equals
    ``SR0`` while skewness and kurtosis are left untouched.

    Args:
        nr: Number of rows (observations/time periods).
        nc: Number of columns (assets/strategies).
        SR0: Target Sharpe ratio. Default 0.
        name: Distribution severity. One of:
            - "gaussian": No skewness or kurtosis
            - "mild": Slight negative skew and excess kurtosis
            - "moderate": Moderate negative skew and excess kurtosis
            - "severe": Strong negative skew and excess kurtosis
            Default "severe".

    Returns:
        Array of shape (nr, nc) containing simulated returns.

    Raises:
        AssertionError: If name is not a valid distribution type.

    Example:
        >>> np.random.seed(42)
        >>> X = generate_non_gaussian_data(1000, 1, SR0=0.2, name="mild")
        >>> X.shape
        (1000, 1)
    """
    # Each entry: (tail probability, tail mean, tail std, core std).
    mixtures = {
        "gaussian": (0, 0, 0.015, 0.010),
        "mild": (0.04, -0.03, 0.015, 0.010),
        "moderate": (0.03, -0.045, 0.020, 0.010),
        "severe": (0.02, -0.060, 0.025, 0.010),
    }
    assert name in mixtures
    tail_prob, tail_mean, tail_std, core_std = mixtures[name]

    # Core mean that makes the unshifted mixture zero-mean.
    core_mean = -tail_prob * tail_mean / (1.0 - tail_prob)

    # Mixture variance = E[x^2] - E[x]^2, from the components' moments.
    core_prob = 1.0 - tail_prob
    mixture_mean = core_prob * core_mean + tail_prob * tail_mean
    second_moment = core_prob * (core_std**2 + core_mean**2) + tail_prob * (tail_std**2 + tail_mean**2)
    mixture_std = np.sqrt(float(second_moment - mixture_mean**2))

    # Shifting both components by the same amount sets the Sharpe ratio
    # without changing the shape of the distribution.
    shift = SR0 * mixture_std

    dims = (nr, nc)
    # Draw order matters for reproducibility under a fixed seed:
    # membership mask first, then core draws, then tail draws.
    from_tail = np.random.uniform(size=dims) < tail_prob
    returns = np.random.normal(core_mean + shift, core_std, size=dims)
    returns[from_tail] = np.random.normal(tail_mean + shift, tail_std, size=from_tail.sum())
    return returns

222

223

def autocorrelation(X: np.ndarray) -> float:
    """Average the lag-1 autocorrelation over the columns of a matrix.

    For every column, the Pearson correlation between the series without
    its first observation and the series without its last observation is
    computed; the function returns the mean of those coefficients.

    Args:
        X: Data matrix of shape (n_observations, n_series).

    Returns:
        Mean autocorrelation coefficient across all columns.

    Example:
        >>> np.random.seed(42)
        >>> # Generate AR(1) process with rho=0.5
        >>> n = 1000
        >>> X = np.zeros((n, 1))
        >>> X[0] = np.random.normal()
        >>> for i in range(1, n):
        ...     X[i] = 0.5 * X[i-1] + np.sqrt(1-0.25) * np.random.normal()
        >>> ac = autocorrelation(X)
        >>> bool(0.4 < ac < 0.6)  # Should be close to 0.5
        True
    """
    _, n_series = X.shape

    # Transposed views so that iterating yields one column at a time.
    current_columns = X[1:].T
    lagged_columns = X[:-1].T

    coefficients = np.zeros(n_series)
    for j, (current, lagged) in enumerate(zip(current_columns, lagged_columns)):
        # Off-diagonal entry of the 2x2 correlation matrix.
        coefficients[j] = np.corrcoef(current, lagged)[0, 1]
    return float(coefficients.mean())

252 return float(ac.mean())