Coverage for src / minibook / main.py: 100%

200 statements  

« prev     ^ index     » next       coverage.py v7.13.4, created at 2026-03-14 15:04 +0000

1"""MiniBook - A tool to create a webpage from a list of links. 

2 

3Generates a clean, responsive HTML webpage using Jinja2 templates. 

4""" 

5 

6import configparser 

7import json 

8import secrets 

9import sys 

10import time 

11from os import getenv 

12from pathlib import Path 

13from typing import NamedTuple 

14from urllib.parse import urlparse 

15 

16import requests 

17import typer 

18 

19from minibook.utils import get_timestamp, load_template 

20 

# Lowest HTTP status code treated as a failure when probing URLs
# (responses with a status below this value count as accessible).
HTTP_BAD_REQUEST = 400

# Minimum number of elements an array-formatted link needs: [name, url].
MIN_LINK_ELEMENTS = 2

# Minimum number of non-empty dot-separated parts for a scheme-less string
# to be treated as a domain name (e.g. "example.com").
MIN_DOMAIN_PARTS = 2

29 

30 

31class GenerationParams(NamedTuple): 

32 """Parameters for minibook generation.""" 

33 

34 output_format: str 

35 title: str 

36 link_tuples: list[tuple[str, str]] 

37 subtitle: str | None 

38 output: str 

39 template: str | None 

40 

41 

def validate_url_format(url: str) -> tuple[bool, str | None]:
    """Validate URL format and scheme.

    Checks that the URL is a non-empty string with http or https scheme, or a relative path.
    Blocks potentially dangerous schemes like javascript:, data:, and file:.

    Leading whitespace and C0 control characters are ignored when detecting
    the scheme, because browsers strip them before resolving a link.

    Args:
        url: The URL string to validate.

    Returns:
        A tuple of (is_valid, error_message). error_message is None if valid.

    Examples:
        Valid HTTP and HTTPS URLs return (True, None):

        >>> validate_url_format("https://example.com")
        (True, None)
        >>> validate_url_format("http://example.com/path/to/page")
        (True, None)
        >>> validate_url_format("https://example.com?query=value&foo=bar")
        (True, None)

        Relative paths are allowed for local file references:

        >>> validate_url_format("./tests/html-report/report.html")
        (True, None)
        >>> validate_url_format("../docs/index.html")
        (True, None)
        >>> validate_url_format("path/to/file.html")
        (True, None)

        Note: Bare filenames without path separators or ./ prefix may be rejected
        if they look like domain names (contain dots). Use explicit path notation:

        >>> validate_url_format("./file.tar.gz")
        (True, None)

        JavaScript URLs are rejected to prevent XSS attacks:

        >>> validate_url_format("javascript:alert(1)")
        (False, "Invalid URL scheme 'javascript': blocked for security")

        Leading whitespace does not hide a dangerous scheme:

        >>> validate_url_format(" javascript:alert(1)")
        (False, "Invalid URL scheme 'javascript': blocked for security")

        Data URLs are rejected to prevent code injection:

        >>> validate_url_format("data:text/html,<script>alert(1)</script>")
        (False, "Invalid URL scheme 'data': blocked for security")

        File URLs are rejected to prevent local file access:

        >>> validate_url_format("file:///etc/passwd")
        (False, "Invalid URL scheme 'file': blocked for security")

        Empty strings and whitespace-only strings are rejected:

        >>> validate_url_format("")
        (False, 'URL must be a non-empty string')
        >>> validate_url_format(" ")
        (False, 'URL must be a non-empty string')

        Non-string values are rejected:

        >>> validate_url_format(None)
        (False, 'URL must be a non-empty string')
        >>> validate_url_format(123)
        (False, 'URL must be a non-empty string')

        Absolute URLs without a valid host are rejected:

        >>> validate_url_format("https://")
        (False, 'URL must have a valid host')

        Malformed URLs that look like domains without scheme are rejected:

        >>> validate_url_format("example.com")
        (False, "Invalid URL scheme '': looks like a domain without http:// or https://")
        >>> validate_url_format("://example.com")
        (False, "Invalid URL scheme '': malformed URL with '://' but no scheme")

    """
    if not isinstance(url, str) or not url.strip():
        return False, "URL must be a non-empty string"

    # Interpreters without the CVE-2023-24329 fix do not strip leading
    # whitespace / C0 control characters in urlparse, so " javascript:..."
    # would parse with an empty scheme and slip past the blocklist below.
    # Stripping here is a no-op on patched interpreters, which do the same.
    c0_control_or_space = "".join(map(chr, range(0x21)))

    try:
        parsed = urlparse(url.lstrip(c0_control_or_space))
    except Exception as e:
        return False, f"Invalid URL: {e}"

    # Block dangerous schemes
    dangerous_schemes = ("javascript", "data", "file", "vbscript", "about")
    if parsed.scheme in dangerous_schemes:
        return False, f"Invalid URL scheme '{parsed.scheme}': blocked for security"

    # Handle URLs with no scheme
    if not parsed.scheme:
        # Reject malformed URLs like "://example.com"
        if url.startswith("://"):
            return False, "Invalid URL scheme '': malformed URL with '://' but no scheme"

        # Reject domain-like strings without scheme (e.g., "example.com", "sub.example.com")
        # These look like absolute URLs missing the scheme
        # Valid relative paths typically start with ./, ../, or contain / early in the path
        if not url.startswith("./") and not url.startswith("../"):
            # Get the part before the first path separator
            first_part = url.split("/")[0].split("?")[0].split("#")[0]

            if "." in first_part:
                parts = first_part.split(".")
                # Domain-like: has 2+ parts and no empty parts (e.g., "example.com", "sub.example.com")
                if len(parts) >= MIN_DOMAIN_PARTS and all(part for part in parts):
                    return False, "Invalid URL scheme '': looks like a domain without http:// or https://"

        # Accept as relative path
        return True, None

    # For absolute URLs, require http or https with a valid host
    if parsed.scheme in ("http", "https"):
        if not parsed.netloc:
            return False, "URL must have a valid host"
        return True, None

    # Any other scheme is not allowed
    return False, f"Invalid URL scheme '{parsed.scheme}': only http, https, or relative paths allowed"

166 

167 

168def validate_link_name(name: str) -> tuple[bool, str | None]: 

169 r"""Validate link name. 

170 

171 Ensures the link name is a non-empty string. Names are used as display 

172 text for links in the generated HTML. 

173 

174 Args: 

175 name: The link name string to validate. 

176 

177 Returns: 

178 A tuple of (is_valid, error_message). error_message is None if valid. 

179 

180 Examples: 

181 Valid non-empty strings return (True, None): 

182 

183 >>> validate_link_name("My Link") 

184 (True, None) 

185 >>> validate_link_name("GitHub") 

186 (True, None) 

187 >>> validate_link_name("A") 

188 (True, None) 

189 

190 Empty strings are rejected: 

191 

192 >>> validate_link_name("") 

193 (False, 'Name must be a non-empty string') 

194 

195 Whitespace-only strings are rejected: 

196 

197 >>> validate_link_name(" ") 

198 (False, 'Name must be a non-empty string') 

199 >>> validate_link_name("\t\n") 

200 (False, 'Name must be a non-empty string') 

201 

202 Non-string values are rejected: 

203 

204 >>> validate_link_name(None) 

205 (False, 'Name must be a non-empty string') 

206 >>> validate_link_name(123) 

207 (False, 'Name must be a non-empty string') 

208 >>> validate_link_name(["list"]) 

209 (False, 'Name must be a non-empty string') 

210 

211 """ 

212 if not isinstance(name, str) or not name.strip(): 

213 return False, "Name must be a non-empty string" 

214 return True, None 

215 

216 

def get_git_repo_url() -> str:
    """Retrieve the GitHub repository URL.

    Checks, in order:
    1. The ``GITHUB_REPOSITORY`` environment variable (e.g. set by GitHub Actions).
    2. The ``[remote "origin"]`` URL in ``.git/config`` (works in local clones).
    3. Falls back to the hardcoded default ``tschm/minibook``.

    Reading ``.git/config`` is best-effort: any I/O or parse error, or an
    origin URL that is not a recognised GitHub URL, results in the default.

    Returns:
        The full HTTPS URL for the GitHub repository.
    """
    github_repo = getenv("GITHUB_REPOSITORY")
    if github_repo:
        return f"https://github.com/{github_repo}"

    # Try to read the origin remote URL from .git/config
    try:
        git_config_path = Path(".git/config")
        if git_config_path.exists():
            # Git config is only INI-like: keys may legitimately repeat
            # (e.g. several "fetch" refspecs), sections may repeat, and "%" has
            # no special meaning. The strict, interpolating defaults would raise
            # on such files and silently push us to the hardcoded fallback.
            config = configparser.ConfigParser(strict=False, interpolation=None)
            config.read(str(git_config_path))
            remote_url = config.get('remote "origin"', "url", fallback=None)
            if remote_url:
                # Normalize SSH URL (git@github.com:owner/repo.git) to HTTPS
                if remote_url.startswith("git@github.com:"):
                    repo = remote_url[len("git@github.com:") :].removesuffix(".git")
                    return f"https://github.com/{repo}"
                # Normalize HTTPS URL (https://github.com/owner/repo[.git])
                if remote_url.startswith(("https://github.com/", "http://github.com/")):
                    repo = remote_url.split("github.com/", 1)[1].removesuffix(".git")
                    return f"https://github.com/{repo}"
    except (OSError, configparser.Error):
        # Deliberate best-effort: fall through to the default below.
        pass

    return "https://github.com/tschm/minibook"

252 

253 

254def validate_url(url: str, timeout: int = 5, delay: float = 0) -> tuple[bool, str | None]: 

255 """Validate if a URL is accessible. 

256 

257 For HTTP/HTTPS URLs, makes a network request to check accessibility. 

258 For relative paths, checks whether the file exists on the local filesystem. 

259 

260 Args: 

261 url (str): The URL to validate 

262 timeout (int, optional): Timeout in seconds for the request 

263 delay (float, optional): Delay in seconds before making the request (rate limiting) 

264 

265 Returns: 

266 tuple: (is_valid, error_message) where is_valid is a boolean and error_message is a string 

267 error_message is None if the URL is valid 

268 

269 """ 

270 if delay > 0: 

271 time.sleep(delay) 

272 

273 # Relative paths are validated by checking local filesystem accessibility 

274 parsed = urlparse(url) 

275 if not parsed.scheme or parsed.scheme not in ("http", "https"): 

276 path = Path(url) 

277 if path.exists(): 

278 return True, None 

279 return False, f"Relative path not accessible: {url}" 

280 

281 try: 

282 # Make a HEAD request to check if the URL is accessible 

283 # HEAD is more efficient than GET as it doesn't download the full content 

284 response = requests.head(url, timeout=timeout, allow_redirects=True) 

285 

286 # If the HEAD request fails, try a GET request as some servers don't support HEAD 

287 if response.status_code >= HTTP_BAD_REQUEST: 

288 response = requests.get(url, timeout=timeout, allow_redirects=True) 

289 

290 # Check if the response status code indicates success 

291 if response.status_code < HTTP_BAD_REQUEST: 

292 return True, None 

293 else: 

294 return False, f"HTTP error: {response.status_code}" 

295 

296 except requests.exceptions.Timeout: 

297 return False, "Timeout error" 

298 except requests.exceptions.ConnectionError: 

299 return False, "Connection error" 

300 except requests.exceptions.RequestException as e: 

301 return False, f"Request error: {e!s}" 

302 except Exception as e: 

303 return False, f"Unexpected error: {e!s}" 

304 

305 

def generate_html(
    title: str,
    links: list[tuple[str, str]],
    subtitle: str | None = None,
    output_file: str = "index.html",
    template_path: str | None = None,
) -> str:
    """Generate an HTML page with the given title and links using Jinja2.

    The rendered page is written to ``output_file`` as UTF-8.

    Args:
        title (str): The title of the webpage
        links (list): A list of tuples with (name, url)
        subtitle (str, optional): A description to include on the page
        output_file (str, optional): The output HTML file
        template_path (str, optional): Path to a custom Jinja2 template file

    Returns:
        str: The path to the generated HTML file

    """
    template = load_template(template_path)
    timestamp = get_timestamp()

    # Generate a unique nonce for CSP
    nonce = secrets.token_urlsafe(16)

    # Render the template with our data
    html = template.render(
        title=title,
        links=links,
        description=subtitle,
        timestamp=timestamp,
        repository_url=get_git_repo_url(),
        nonce=nonce,
    )

    # Write as UTF-8 explicitly: the default text encoding is platform/locale
    # dependent and can fail or corrupt non-ASCII titles and link names.
    with Path(output_file).open("w", encoding="utf-8") as f:
        f.write(html)

    return output_file

347 

348 

def parse_links_from_json(links_json: str) -> tuple[list[tuple[str, str]], list[str]]:
    """Parse links from a JSON string into a list of tuples.

    Supports multiple JSON formats:
    - List of objects: [{"name": "...", "url": "..."}, ...]
    - List of arrays: [["name", "url"], ...]
    - Dictionary: {"name1": "url1", "name2": "url2", ...}

    Validates that names are non-empty strings and that URLs pass
    ``validate_url_format`` (http/https URLs or relative paths). Invalid items
    are skipped with warnings. List items are handled one by one, so a list
    mixing objects and arrays is processed and any other item type is reported
    rather than silently discarding the whole list. A top-level JSON value that
    is neither a list nor an object yields no links and a single warning.

    Args:
        links_json (str): JSON-formatted string containing links

    Returns:
        tuple[list[tuple[str, str]], list[str]]: A tuple containing:
        - List of valid (name, url) tuples
        - List of warning messages for skipped items

    Raises:
        json.JSONDecodeError: If the JSON string is invalid

    """
    cleaned_links = links_json.strip()
    json_data = json.loads(cleaned_links)

    link_tuples: list[tuple[str, str]] = []
    warnings: list[str] = []

    def validate_and_append(name: str, url: str, context: str = "") -> None:
        """Validate a name/url pair and append if valid, otherwise add warning."""
        # Validate name
        name_valid, name_error = validate_link_name(name)
        if not name_valid:
            warnings.append(f"Skipping item{context}: {name_error}")
            return

        # Validate URL
        url_valid, url_error = validate_url_format(url)
        if not url_valid:
            warnings.append(f"Skipping '{name}'{context}: {url_error}")
            return

        link_tuples.append((name, url))

    if isinstance(json_data, list):
        for i, item in enumerate(json_data):
            if isinstance(item, list):
                # Array form: ["name", "url", ...]; extra elements are ignored
                if len(item) >= MIN_LINK_ELEMENTS:
                    validate_and_append(item[0], item[1], f" at index {i}")
                else:
                    warnings.append(
                        f"Skipping item at index {i}: array must have at least {MIN_LINK_ELEMENTS} elements"
                    )
            elif isinstance(item, dict):
                # Object form: {"name": "...", "url": "..."}
                if "name" in item and "url" in item:
                    validate_and_append(item["name"], item["url"], f" at index {i}")
                else:
                    warnings.append(f"Skipping item at index {i}: missing 'name' or 'url' key")
            else:
                warnings.append(
                    f"Skipping item at index {i}: expected an object or array, got {type(item).__name__}"
                )
    elif isinstance(json_data, dict):
        # Dictionary form: {"name1": "url1", "name2": "url2", ...}
        for name, url in json_data.items():
            validate_and_append(name, url)
    else:
        warnings.append(
            f"Unsupported JSON structure: expected a list or object, got {type(json_data).__name__}"
        )

    return link_tuples, warnings

418 

419 

def validate_link_list(link_tuples: list[tuple[str, str]], delay: float = 0) -> tuple[bool, list[tuple[str, str, str]]]:
    """Check every link for accessibility and collect the failures.

    Each URL is passed to ``validate_url`` while a progress bar is displayed.

    Args:
        link_tuples (list[tuple[str, str]]): The (name, url) pairs to check
        delay (float, optional): Delay in seconds forwarded to each check (rate limiting)

    Returns:
        tuple[bool, list[tuple[str, str, str]]]: A tuple containing:
        - bool: True when no link failed, False otherwise
        - list: One (name, url, error_message) tuple per failed link

    """
    failures: list[tuple[str, str, str]] = []

    with typer.progressbar(link_tuples) as bar:
        for link_name, link_url in bar:
            ok, reason = validate_url(link_url, delay=delay)
            if ok:
                continue
            # A failed check is expected to carry a message; guard against None anyway
            failures.append((link_name, link_url, reason or "Unknown error"))

    return not failures, failures

443 

444 

def _handle_parsing(links: str) -> list[tuple[str, str]]:
    """Parse the raw ``--links`` value and report the outcome to the user.

    Returns the valid (name, url) pairs. On a JSON decoding or type error a
    message is echoed and an empty list is returned. Warnings for skipped
    items are echoed to stderr.
    """
    try:
        parsed, skipped = parse_links_from_json(links)
    except (json.JSONDecodeError, TypeError) as exc:
        typer.echo(f"JSON parsing failed, falling back to legacy format: {exc}")
        return []

    typer.echo(f"Parsed JSON links: {parsed}")

    # Tell the user which items were dropped and why
    if skipped:
        typer.echo(f"\nWarning: {len(skipped)} item(s) skipped due to validation errors:", err=True)
        for message in skipped:
            typer.echo(f" - {message}", err=True)

    return parsed

462 

463 

def _handle_validation(link_tuples: list[tuple[str, str]], request_delay: float) -> bool:
    """Run link validation and decide whether generation should proceed.

    Returns True when every link is valid. Otherwise the invalid links are
    listed on stderr and the user's answer to a confirmation prompt is returned.
    """
    typer.echo("Validating links...")
    everything_ok, broken = validate_link_list(link_tuples, delay=request_delay)

    if everything_ok:
        typer.echo("All links are valid!")
        return True

    # Report invalid links
    typer.echo(f"\nFound {len(broken)} invalid links:", err=True)
    for link_name, link_url, reason in broken:
        typer.echo(f" - {link_name} ({link_url}): {reason}", err=True)

    # Let the user decide whether to carry on regardless
    answer = typer.confirm("Do you want to continue with invalid links?")
    return bool(answer)

480 

481 

def _generate_output(params: GenerationParams) -> int:
    """Produce the output file through the plugin matching the requested format.

    Returns 0 on success. Returns 1 after echoing the error when the format is
    rejected by ``get_plugin`` (ValueError) or when generation raises
    FileNotFoundError or ImportError.
    """
    from minibook.plugins import get_plugin

    try:
        plugin_cls = get_plugin(params.output_format)
    except ValueError as exc:
        typer.echo(f"Error: {exc}", err=True)
        return 1

    fmt = params.output_format.lower()

    # Well-known formats get a fixed file name; anything else falls back to
    # "output" plus the plugin's own extension.
    known_filenames = {
        "html": "index.html",
        "markdown": "links.md",
        "md": "links.md",
        "json": "links.json",
        "pdf": "links.pdf",
    }
    filename = known_filenames.get(fmt, f"output{plugin_cls.extension}")
    target = Path(params.output) / filename

    try:
        from minibook.plugins import HTMLPlugin, OutputPlugin

        # Only HTML output with an explicit template needs a configured plugin instance
        plugin: OutputPlugin
        if fmt == "html" and params.template:
            plugin = HTMLPlugin(template_path=params.template)
        else:
            plugin = plugin_cls()

        written = plugin.generate(params.title, params.link_tuples, params.subtitle, target)
    except (FileNotFoundError, ImportError) as exc:
        typer.echo(f"Error: {exc}", err=True)
        return 1

    typer.echo(f"{params.output_format.upper()} minibook created successfully: {Path(written).absolute()}")
    return 0

518 

519 

app = typer.Typer(help="Create a minibook from a list of links")


@app.command()
def entrypoint(
    title: str = typer.Option("My Links", "--title", "-t", help="Title of the minibook"),
    subtitle: str | None = typer.Option(None, "--subtitle", help="Subtitle of the minibook"),
    output: str = typer.Option("artifacts", "--output", "-o", help="Output directory"),
    links: str = typer.Option(
        None,
        "--links",
        "-l",
        help="JSON formatted links: can be a list of objects with name/url keys, a list of arrays, or a dictionary",
    ),
    validate_links: bool = typer.Option(False, "--validate-links", help="Validate that all links are accessible"),
    request_delay: float = typer.Option(
        0.0, "--request-delay", help="Delay in seconds between URL validation requests (rate limiting)"
    ),
    output_format: str = typer.Option("html", "--format", "-f", help="Output format: html, markdown, json, or pdf"),
    template: str | None = typer.Option(
        None, "--template", help="Path to a custom Jinja2 template file for HTML output"
    ),
) -> int:
    """Create a minibook from a list of links."""
    # Without --links there is nothing to do: terminate the process
    if links is None:
        typer.echo("No links provided. Exiting.", err=True)
        sys.exit(1)

    typer.echo(f"Parsing links: {links}")

    # Turn the JSON payload into validated (name, url) pairs
    parsed_links = _handle_parsing(links)
    if not parsed_links:
        typer.echo("Error: No valid links to process.", err=True)
        return 1

    # Optional accessibility check; the user may decline to continue
    if validate_links:
        proceed = _handle_validation(parsed_links, request_delay)
        if not proceed:
            typer.echo("Aborting due to invalid links.", err=True)
            return 1

    # Hand everything to the output stage and pass its status code through
    return _generate_output(
        GenerationParams(
            output_format=output_format,
            title=title,
            link_tuples=parsed_links,
            subtitle=subtitle,
            output=output,
            template=template,
        )
    )

572 

573 

# Run the Typer application when the module is executed as a script.
if __name__ == "__main__":
    app()  # pragma: no cover