Coverage for src / minibook / main.py: 100%

178 statements  

« prev     ^ index     » next       coverage.py v7.13.2, created at 2026-01-27 10:03 +0000

1"""MiniBook - A tool to create a webpage from a list of links. 

2 

3Generates a clean, responsive HTML webpage using Jinja2 templates. 

4""" 

5 

6import json 

7import secrets 

8import sys 

9import time 

10from os import getenv 

11from pathlib import Path 

12from typing import NamedTuple 

13from urllib.parse import urlparse 

14 

15import requests 

16import typer 

17 

18from minibook.utils import get_timestamp, load_template 

19 

# Lowest HTTP status code that counts as a failed response (4xx and 5xx)
HTTP_BAD_REQUEST = 400

# A link given as a JSON array needs at least a name and a URL
MIN_LINK_ELEMENTS = 2

# Number of dot-separated labels from which a bare string is treated as a domain
MIN_DOMAIN_PARTS = 2

28 

29 

30class GenerationParams(NamedTuple): 

31 """Parameters for minibook generation.""" 

32 

33 output_format: str 

34 title: str 

35 link_tuples: list[tuple[str, str]] 

36 subtitle: str | None 

37 output: str 

38 template: str | None 

39 

40 

41def validate_url_format(url: str) -> tuple[bool, str | None]: 

42 """Validate URL format and scheme. 

43 

44 Checks that the URL is a non-empty string with http or https scheme, or a relative path. 

45 Blocks potentially dangerous schemes like javascript:, data:, and file:. 

46 

47 Args: 

48 url: The URL string to validate. 

49 

50 Returns: 

51 A tuple of (is_valid, error_message). error_message is None if valid. 

52 

53 Examples: 

54 Valid HTTP and HTTPS URLs return (True, None): 

55 

56 >>> validate_url_format("https://example.com") 

57 (True, None) 

58 >>> validate_url_format("http://example.com/path/to/page") 

59 (True, None) 

60 >>> validate_url_format("https://example.com?query=value&foo=bar") 

61 (True, None) 

62 

63 Relative paths are allowed for local file references: 

64 

65 >>> validate_url_format("./tests/html-report/report.html") 

66 (True, None) 

67 >>> validate_url_format("../docs/index.html") 

68 (True, None) 

69 >>> validate_url_format("path/to/file.html") 

70 (True, None) 

71 

72 Note: Bare filenames without path separators or ./ prefix may be rejected 

73 if they look like domain names (contain dots). Use explicit path notation: 

74 

75 >>> validate_url_format("./file.tar.gz") 

76 (True, None) 

77 

78 JavaScript URLs are rejected to prevent XSS attacks: 

79 

80 >>> validate_url_format("javascript:alert(1)") 

81 (False, "Invalid URL scheme 'javascript': blocked for security") 

82 

83 Data URLs are rejected to prevent code injection: 

84 

85 >>> validate_url_format("data:text/html,<script>alert(1)</script>") 

86 (False, "Invalid URL scheme 'data': blocked for security") 

87 

88 File URLs are rejected to prevent local file access: 

89 

90 >>> validate_url_format("file:///etc/passwd") 

91 (False, "Invalid URL scheme 'file': blocked for security") 

92 

93 Empty strings and whitespace-only strings are rejected: 

94 

95 >>> validate_url_format("") 

96 (False, 'URL must be a non-empty string') 

97 >>> validate_url_format(" ") 

98 (False, 'URL must be a non-empty string') 

99 

100 Non-string values are rejected: 

101 

102 >>> validate_url_format(None) 

103 (False, 'URL must be a non-empty string') 

104 >>> validate_url_format(123) 

105 (False, 'URL must be a non-empty string') 

106 

107 Absolute URLs without a valid host are rejected: 

108 

109 >>> validate_url_format("https://") 

110 (False, 'URL must have a valid host') 

111 

112 Malformed URLs that look like domains without scheme are rejected: 

113 

114 >>> validate_url_format("example.com") 

115 (False, "Invalid URL scheme '': looks like a domain without http:// or https://") 

116 >>> validate_url_format("://example.com") 

117 (False, "Invalid URL scheme '': malformed URL with '://' but no scheme") 

118 

119 """ 

120 if not isinstance(url, str) or not url.strip(): 

121 return False, "URL must be a non-empty string" 

122 

123 try: 

124 parsed = urlparse(url) 

125 except Exception as e: 

126 return False, f"Invalid URL: {e}" 

127 

128 # Block dangerous schemes 

129 dangerous_schemes = ("javascript", "data", "file", "vbscript", "about") 

130 if parsed.scheme in dangerous_schemes: 

131 return False, f"Invalid URL scheme '{parsed.scheme}': blocked for security" 

132 

133 # Handle URLs with no scheme 

134 if not parsed.scheme: 

135 # Reject malformed URLs like "://example.com" 

136 if url.startswith("://"): 

137 return False, "Invalid URL scheme '': malformed URL with '://' but no scheme" 

138 

139 # Reject domain-like strings without scheme (e.g., "example.com", "sub.example.com") 

140 # These look like absolute URLs missing the scheme 

141 # Valid relative paths typically start with ./, ../, or contain / early in the path 

142 if not url.startswith("./") and not url.startswith("../"): 

143 # Get the part before the first path separator 

144 first_part = url.split("/")[0].split("?")[0].split("#")[0] 

145 

146 # If it contains a dot and looks like a domain name (no path separators at all) 

147 # OR has multiple dot-separated parts suggesting a domain 

148 if "." in first_part: 

149 parts = first_part.split(".") 

150 # Domain-like: has 2+ parts and no empty parts (e.g., "example.com", "sub.example.com") 

151 if len(parts) >= MIN_DOMAIN_PARTS and all(part for part in parts): 

152 return False, "Invalid URL scheme '': looks like a domain without http:// or https://" 

153 

154 # Accept as relative path 

155 return True, None 

156 

157 # For absolute URLs, require http or https with a valid host 

158 if parsed.scheme in ("http", "https"): 

159 if not parsed.netloc: 

160 return False, "URL must have a valid host" 

161 return True, None 

162 

163 # Any other scheme is not allowed 

164 return False, f"Invalid URL scheme '{parsed.scheme}': only http, https, or relative paths allowed" 

165 

166 

167def validate_link_name(name: str) -> tuple[bool, str | None]: 

168 r"""Validate link name. 

169 

170 Ensures the link name is a non-empty string. Names are used as display 

171 text for links in the generated HTML. 

172 

173 Args: 

174 name: The link name string to validate. 

175 

176 Returns: 

177 A tuple of (is_valid, error_message). error_message is None if valid. 

178 

179 Examples: 

180 Valid non-empty strings return (True, None): 

181 

182 >>> validate_link_name("My Link") 

183 (True, None) 

184 >>> validate_link_name("GitHub") 

185 (True, None) 

186 >>> validate_link_name("A") 

187 (True, None) 

188 

189 Empty strings are rejected: 

190 

191 >>> validate_link_name("") 

192 (False, 'Name must be a non-empty string') 

193 

194 Whitespace-only strings are rejected: 

195 

196 >>> validate_link_name(" ") 

197 (False, 'Name must be a non-empty string') 

198 >>> validate_link_name("\t\n") 

199 (False, 'Name must be a non-empty string') 

200 

201 Non-string values are rejected: 

202 

203 >>> validate_link_name(None) 

204 (False, 'Name must be a non-empty string') 

205 >>> validate_link_name(123) 

206 (False, 'Name must be a non-empty string') 

207 >>> validate_link_name(["list"]) 

208 (False, 'Name must be a non-empty string') 

209 

210 """ 

211 if not isinstance(name, str) or not name.strip(): 

212 return False, "Name must be a non-empty string" 

213 return True, None 

214 

215 

def get_git_repo_url() -> str:
    """Build the GitHub URL of the repository.

    The "owner/name" part is read from the ``GITHUB_REPOSITORY`` environment
    variable; when that variable is not set, ``tschm/minibook`` is used.

    Returns:
        The repository URL on github.com.
    """
    repo_slug = getenv("GITHUB_REPOSITORY", default="tschm/minibook")
    return "https://github.com/" + repo_slug

230 

231 

def validate_url(url: str, timeout: int = 5, delay: float = 0) -> tuple[bool, str | None]:
    """Check whether a URL answers with a non-error HTTP status.

    Args:
        url (str): The URL to probe
        timeout (int, optional): Timeout in seconds passed to each request
        delay (float, optional): Seconds to sleep before the first request (rate limiting)

    Returns:
        tuple: (is_valid, error_message); error_message is None when the URL
            answered with a status below 400, otherwise a short description
            of the failure. Request failures are reported, never raised.

    """
    # Optional pause so callers can space out consecutive probes
    if delay > 0:
        time.sleep(delay)

    try:
        # HEAD first: it avoids downloading the body
        reply = requests.head(url, timeout=timeout, allow_redirects=True)

        # Some servers reject HEAD, so an error status gets a second chance with GET
        if reply.status_code >= HTTP_BAD_REQUEST:
            reply = requests.get(url, timeout=timeout, allow_redirects=True)

        status = reply.status_code
        if status >= HTTP_BAD_REQUEST:
            return False, f"HTTP error: {status}"
        return True, None

    except requests.exceptions.Timeout:
        return False, "Timeout error"
    except requests.exceptions.ConnectionError:
        return False, "Connection error"
    except requests.exceptions.RequestException as exc:
        return False, f"Request error: {exc!s}"
    except Exception as exc:
        return False, f"Unexpected error: {exc!s}"

271 

272 

def generate_html(
    title: str,
    links: list[tuple[str, str]],
    subtitle: str | None = None,
    output_file: str = "index.html",
    template_path: str | None = None,
) -> str:
    """Generate an HTML page with the given title and links using Jinja2.

    The rendered page is written as UTF-8 so that non-ASCII characters in the
    title, subtitle or link names do not depend on the platform's locale
    encoding.

    Args:
        title (str): The title of the webpage
        links (list): A list of tuples with (name, url)
        subtitle (str, optional): A description to include on the page
        output_file (str, optional): The output HTML file
        template_path (str, optional): Path to a custom Jinja2 template file

    Returns:
        str: The path to the generated HTML file

    """
    template = load_template(template_path)
    timestamp = get_timestamp()

    # Fresh random value per page, handed to the template as its CSP nonce
    nonce = secrets.token_urlsafe(16)

    # Render the template with our data
    html = template.render(
        title=title,
        links=links,
        description=subtitle,
        timestamp=timestamp,
        repository_url=get_git_repo_url(),
        nonce=nonce,
    )

    # Explicit encoding: the default would be the locale's, which can fail on
    # (or mis-encode) non-ASCII text on some platforms.
    with Path(output_file).open("w", encoding="utf-8") as f:
        f.write(html)

    return output_file

314 

315 

def parse_links_from_json(links_json: str) -> tuple[list[tuple[str, str]], list[str]]:
    """Parse links from a JSON string into a list of tuples.

    Supports multiple JSON formats:
    - List of objects: [{"name": "...", "url": "..."}, ...]
    - List of arrays: [["name", "url"], ...]
    - Dictionary: {"name1": "url1", "name2": "url2", ...}

    List entries are handled one by one, so a list may mix objects and
    arrays. Names and URLs are checked with ``validate_link_name`` and
    ``validate_url_format``. Every entry that cannot be used - failed
    validation, missing keys, too few array elements, or an unsupported
    entry type - is skipped and reported in the returned warnings. A
    top-level value that is neither a list nor an object yields no links and
    a single warning.

    Args:
        links_json (str): JSON-formatted string containing links

    Returns:
        tuple[list[tuple[str, str]], list[str]]: A tuple containing:
            - List of valid (name, url) tuples
            - List of warning messages for skipped items

    Raises:
        json.JSONDecodeError: If the JSON string is invalid

    """
    json_data = json.loads(links_json.strip())

    link_tuples: list[tuple[str, str]] = []
    warnings: list[str] = []

    def validate_and_append(name: str, url: str, context: str = "") -> None:
        """Validate a name/url pair and append if valid, otherwise add warning."""
        name_valid, name_error = validate_link_name(name)
        if not name_valid:
            warnings.append(f"Skipping item{context}: {name_error}")
            return

        url_valid, url_error = validate_url_format(url)
        if not url_valid:
            warnings.append(f"Skipping '{name}'{context}: {url_error}")
            return

        link_tuples.append((name, url))

    if isinstance(json_data, dict):
        # Dictionary form: {"name1": "url1", "name2": "url2", ...}
        for name, url in json_data.items():
            validate_and_append(name, url)
    elif isinstance(json_data, list):
        # Dispatch per entry so one unexpected entry cannot silently discard
        # the whole list.
        for i, item in enumerate(json_data):
            if isinstance(item, dict):
                if "name" in item and "url" in item:
                    validate_and_append(item["name"], item["url"], f" at index {i}")
                else:
                    warnings.append(f"Skipping item at index {i}: missing 'name' or 'url' key")
            elif isinstance(item, list):
                if len(item) >= MIN_LINK_ELEMENTS:
                    validate_and_append(item[0], item[1], f" at index {i}")
                else:
                    warnings.append(
                        f"Skipping item at index {i}: array must have at least {MIN_LINK_ELEMENTS} elements"
                    )
            else:
                warnings.append(
                    f"Skipping item at index {i}: expected an object or an array, got {type(item).__name__}"
                )
    else:
        # Valid JSON, but not a shape that can hold links (string, number, ...)
        warnings.append(
            f"Unsupported JSON structure: expected a list or an object, got {type(json_data).__name__}"
        )

    return link_tuples, warnings

385 

386 

def validate_link_list(link_tuples: list[tuple[str, str]], delay: float = 0) -> tuple[bool, list[tuple[str, str, str]]]:
    """Probe every link with ``validate_url`` and collect the ones that fail.

    A progress bar is shown while the links are processed.

    Args:
        link_tuples (list[tuple[str, str]]): List of (name, url) tuples to validate
        delay (float, optional): Delay in seconds forwarded to each ``validate_url`` call

    Returns:
        tuple[bool, list[tuple[str, str, str]]]: A tuple containing:
            - bool: True when no link failed, False otherwise
            - list: (name, url, error_message) for every failed link

    """
    failures: list[tuple[str, str, str]] = []

    with typer.progressbar(link_tuples) as bar:
        for label, target in bar:
            ok, reason = validate_url(target, delay=delay)
            if ok:
                continue
            # A missing reason is replaced so the third slot is always text
            failures.append((label, target, reason or "Unknown error"))

    return not failures, failures

410 

411 

def _handle_parsing(links: str) -> list[tuple[str, str]]:
    """Parse the links argument and report the outcome on the console.

    Returns the valid (name, url) pairs, or an empty list when the input
    could not be parsed as JSON.
    """
    try:
        parsed, skipped = parse_links_from_json(links)
    except (json.JSONDecodeError, TypeError):
        # NOTE: despite the wording of the message, no alternative format is
        # tried in this function; the caller simply receives an empty list.
        typer.echo("JSON parsing failed, falling back to legacy format")
        return []

    typer.echo(f"Parsed JSON links: {parsed}")

    # Skipped items are listed on stderr
    if skipped:
        typer.echo(f"\nWarning: {len(skipped)} item(s) skipped due to validation errors:", err=True)
        for note in skipped:
            typer.echo(f" - {note}", err=True)

    return parsed

429 

430 

def _handle_validation(link_tuples: list[tuple[str, str]], request_delay: float) -> bool:
    """Check the links for reachability and decide whether to carry on.

    Returns True when every link is valid, or when the user confirms to
    continue despite invalid links; False when the user declines.
    """
    typer.echo("Validating links...")
    all_ok, broken = validate_link_list(link_tuples, delay=request_delay)

    if all_ok:
        typer.echo("All links are valid!")
        return True

    # List the failures on stderr, then leave the decision to the user
    typer.echo(f"\nFound {len(broken)} invalid links:", err=True)
    for label, target, reason in broken:
        typer.echo(f" - {label} ({target}): {reason}", err=True)

    return bool(typer.confirm("Do you want to continue with invalid links?"))

447 

448 

def _generate_output(params: GenerationParams) -> int:
    """Write the minibook with the plugin registered for the requested format.

    Returns 0 on success and 1 when the format is unknown or generation
    fails with a FileNotFoundError or ImportError; the error is echoed to
    stderr in both failure cases.
    """
    from minibook.plugins import get_plugin

    fmt = params.output_format
    try:
        plugin_cls = get_plugin(fmt)
    except ValueError as exc:
        typer.echo(f"Error: {exc}", err=True)
        return 1

    # Well-known formats have a fixed file name; anything else falls back to
    # "output" plus the plugin's own extension.
    fmt_key = fmt.lower()
    known_names = {
        "html": "index.html",
        "markdown": "links.md",
        "md": "links.md",
        "json": "links.json",
        "pdf": "links.pdf",
    }
    filename = known_names.get(fmt_key, f"output{plugin_cls.extension}")
    target = Path(params.output) / filename

    try:
        from minibook.plugins import HTMLPlugin, OutputPlugin

        # Only HTML output honours a custom template
        plugin: OutputPlugin
        if fmt_key == "html" and params.template:
            plugin = HTMLPlugin(template_path=params.template)
        else:
            plugin = plugin_cls()

        written = plugin.generate(params.title, params.link_tuples, params.subtitle, target)
    except (FileNotFoundError, ImportError) as exc:
        typer.echo(f"Error: {exc}", err=True)
        return 1

    typer.echo(f"{fmt.upper()} minibook created successfully: {Path(written).absolute()}")
    return 0

488 

489 

app = typer.Typer(help="Create a minibook from a list of links")


# Command flow: parse the --links JSON, optionally probe every link, then
# hand the collected settings to the output step. Failure paths after parsing
# return 1; a missing --links option exits the process with status 1.
# NOTE(review): confirm that the integer return value is propagated as the
# process exit status when the command is run through `app()`.
@app.command()  # type: ignore[untyped-decorator]
def entrypoint(
    title: str = typer.Option("My Links", "--title", "-t", help="Title of the minibook"),
    subtitle: str | None = typer.Option(None, "--subtitle", help="Subtitle of the minibook"),
    output: str = typer.Option("artifacts", "--output", "-o", help="Output directory"),
    links: str = typer.Option(
        None,
        "--links",
        "-l",
        help="JSON formatted links: can be a list of objects with name/url keys, a list of arrays, or a dictionary",
    ),
    validate_links: bool = typer.Option(False, "--validate-links", help="Validate that all links are accessible"),
    request_delay: float = typer.Option(
        0.0, "--request-delay", help="Delay in seconds between URL validation requests (rate limiting)"
    ),
    output_format: str = typer.Option("html", "--format", "-f", help="Output format: html, markdown, json, or pdf"),
    template: str | None = typer.Option(
        None, "--template", help="Path to a custom Jinja2 template file for HTML output"
    ),
) -> int:
    """Create a minibook from a list of links."""
    if links is None:
        typer.echo("No links provided. Exiting.", err=True)
        sys.exit(1)

    typer.echo(f"Parsing links: {links}")

    # Nothing usable left after parsing means there is nothing to generate
    parsed_links = _handle_parsing(links)
    if not parsed_links:
        typer.echo("Error: No valid links to process.", err=True)
        return 1

    # Reachability check is opt-in; the helper returns False when the user declines
    if validate_links:
        proceed = _handle_validation(parsed_links, request_delay)
        if not proceed:
            typer.echo("Aborting due to invalid links.", err=True)
            return 1

    return _generate_output(
        GenerationParams(
            output_format=output_format,
            title=title,
            link_tuples=parsed_links,
            subtitle=subtitle,
            output=output,
            template=template,
        )
    )

542 

543 

# Run the Typer application when this module is executed as a script
if __name__ == "__main__":
    app()  # pragma: no cover