Coverage for src / minibook / main.py: 100%
178 statements
« prev ^ index » next coverage.py v7.13.2, created at 2026-01-27 10:03 +0000
« prev ^ index » next coverage.py v7.13.2, created at 2026-01-27 10:03 +0000
"""MiniBook - A tool to create a webpage from a list of links.

Generates a clean, responsive HTML webpage using Jinja2 templates.
"""
6import json
7import secrets
8import sys
9import time
10from os import getenv
11from pathlib import Path
12from typing import NamedTuple
13from urllib.parse import urlparse
15import requests
16import typer
18from minibook.utils import get_timestamp, load_template
# Responses with a status code at or above this value are treated as failures
HTTP_BAD_REQUEST = 400

# A list-formatted link must carry at least a name and a URL
MIN_LINK_ELEMENTS = 2

# A scheme-less string with at least this many non-empty dot-separated
# labels is treated as a domain name rather than a relative path
MIN_DOMAIN_PARTS = 2
30class GenerationParams(NamedTuple):
31 """Parameters for minibook generation."""
33 output_format: str
34 title: str
35 link_tuples: list[tuple[str, str]]
36 subtitle: str | None
37 output: str
38 template: str | None
41def validate_url_format(url: str) -> tuple[bool, str | None]:
42 """Validate URL format and scheme.
44 Checks that the URL is a non-empty string with http or https scheme, or a relative path.
45 Blocks potentially dangerous schemes like javascript:, data:, and file:.
47 Args:
48 url: The URL string to validate.
50 Returns:
51 A tuple of (is_valid, error_message). error_message is None if valid.
53 Examples:
54 Valid HTTP and HTTPS URLs return (True, None):
56 >>> validate_url_format("https://example.com")
57 (True, None)
58 >>> validate_url_format("http://example.com/path/to/page")
59 (True, None)
60 >>> validate_url_format("https://example.com?query=value&foo=bar")
61 (True, None)
63 Relative paths are allowed for local file references:
65 >>> validate_url_format("./tests/html-report/report.html")
66 (True, None)
67 >>> validate_url_format("../docs/index.html")
68 (True, None)
69 >>> validate_url_format("path/to/file.html")
70 (True, None)
72 Note: Bare filenames without path separators or ./ prefix may be rejected
73 if they look like domain names (contain dots). Use explicit path notation:
75 >>> validate_url_format("./file.tar.gz")
76 (True, None)
78 JavaScript URLs are rejected to prevent XSS attacks:
80 >>> validate_url_format("javascript:alert(1)")
81 (False, "Invalid URL scheme 'javascript': blocked for security")
83 Data URLs are rejected to prevent code injection:
85 >>> validate_url_format("data:text/html,<script>alert(1)</script>")
86 (False, "Invalid URL scheme 'data': blocked for security")
88 File URLs are rejected to prevent local file access:
90 >>> validate_url_format("file:///etc/passwd")
91 (False, "Invalid URL scheme 'file': blocked for security")
93 Empty strings and whitespace-only strings are rejected:
95 >>> validate_url_format("")
96 (False, 'URL must be a non-empty string')
97 >>> validate_url_format(" ")
98 (False, 'URL must be a non-empty string')
100 Non-string values are rejected:
102 >>> validate_url_format(None)
103 (False, 'URL must be a non-empty string')
104 >>> validate_url_format(123)
105 (False, 'URL must be a non-empty string')
107 Absolute URLs without a valid host are rejected:
109 >>> validate_url_format("https://")
110 (False, 'URL must have a valid host')
112 Malformed URLs that look like domains without scheme are rejected:
114 >>> validate_url_format("example.com")
115 (False, "Invalid URL scheme '': looks like a domain without http:// or https://")
116 >>> validate_url_format("://example.com")
117 (False, "Invalid URL scheme '': malformed URL with '://' but no scheme")
119 """
120 if not isinstance(url, str) or not url.strip():
121 return False, "URL must be a non-empty string"
123 try:
124 parsed = urlparse(url)
125 except Exception as e:
126 return False, f"Invalid URL: {e}"
128 # Block dangerous schemes
129 dangerous_schemes = ("javascript", "data", "file", "vbscript", "about")
130 if parsed.scheme in dangerous_schemes:
131 return False, f"Invalid URL scheme '{parsed.scheme}': blocked for security"
133 # Handle URLs with no scheme
134 if not parsed.scheme:
135 # Reject malformed URLs like "://example.com"
136 if url.startswith("://"):
137 return False, "Invalid URL scheme '': malformed URL with '://' but no scheme"
139 # Reject domain-like strings without scheme (e.g., "example.com", "sub.example.com")
140 # These look like absolute URLs missing the scheme
141 # Valid relative paths typically start with ./, ../, or contain / early in the path
142 if not url.startswith("./") and not url.startswith("../"):
143 # Get the part before the first path separator
144 first_part = url.split("/")[0].split("?")[0].split("#")[0]
146 # If it contains a dot and looks like a domain name (no path separators at all)
147 # OR has multiple dot-separated parts suggesting a domain
148 if "." in first_part:
149 parts = first_part.split(".")
150 # Domain-like: has 2+ parts and no empty parts (e.g., "example.com", "sub.example.com")
151 if len(parts) >= MIN_DOMAIN_PARTS and all(part for part in parts):
152 return False, "Invalid URL scheme '': looks like a domain without http:// or https://"
154 # Accept as relative path
155 return True, None
157 # For absolute URLs, require http or https with a valid host
158 if parsed.scheme in ("http", "https"):
159 if not parsed.netloc:
160 return False, "URL must have a valid host"
161 return True, None
163 # Any other scheme is not allowed
164 return False, f"Invalid URL scheme '{parsed.scheme}': only http, https, or relative paths allowed"
167def validate_link_name(name: str) -> tuple[bool, str | None]:
168 r"""Validate link name.
170 Ensures the link name is a non-empty string. Names are used as display
171 text for links in the generated HTML.
173 Args:
174 name: The link name string to validate.
176 Returns:
177 A tuple of (is_valid, error_message). error_message is None if valid.
179 Examples:
180 Valid non-empty strings return (True, None):
182 >>> validate_link_name("My Link")
183 (True, None)
184 >>> validate_link_name("GitHub")
185 (True, None)
186 >>> validate_link_name("A")
187 (True, None)
189 Empty strings are rejected:
191 >>> validate_link_name("")
192 (False, 'Name must be a non-empty string')
194 Whitespace-only strings are rejected:
196 >>> validate_link_name(" ")
197 (False, 'Name must be a non-empty string')
198 >>> validate_link_name("\t\n")
199 (False, 'Name must be a non-empty string')
201 Non-string values are rejected:
203 >>> validate_link_name(None)
204 (False, 'Name must be a non-empty string')
205 >>> validate_link_name(123)
206 (False, 'Name must be a non-empty string')
207 >>> validate_link_name(["list"])
208 (False, 'Name must be a non-empty string')
210 """
211 if not isinstance(name, str) or not name.strip():
212 return False, "Name must be a non-empty string"
213 return True, None
def get_git_repo_url() -> str:
    """Retrieve the GitHub repository URL.

    Builds the URL from the 'GITHUB_REPOSITORY' environment variable. If the
    variable is unset or empty, 'tschm/minibook' is used instead.

    Returns:
        The full URL for the GitHub repository.
    """
    # `or` rather than getenv's default: the default only applies when the
    # variable is missing, and an empty value would yield "https://github.com/"
    github_repo = getenv("GITHUB_REPOSITORY") or "tschm/minibook"
    return f"https://github.com/{github_repo}"
def validate_url(url: str, timeout: int = 5, delay: float = 0) -> tuple[bool, str | None]:
    """Check whether a URL answers with a non-error HTTP status.

    Args:
        url (str): The URL to probe
        timeout (int, optional): Timeout in seconds passed to each request
        delay (float, optional): Seconds to sleep before the first request (rate limiting)

    Returns:
        tuple: (is_valid, error_message); error_message is None when the URL is valid

    """
    if delay > 0:
        time.sleep(delay)

    try:
        # Try HEAD first so the body does not have to be downloaded
        response = requests.head(url, timeout=timeout, allow_redirects=True)

        # Some servers reject HEAD; retry once with GET before giving up
        if response.status_code >= HTTP_BAD_REQUEST:
            response = requests.get(url, timeout=timeout, allow_redirects=True)

        if response.status_code >= HTTP_BAD_REQUEST:
            return False, f"HTTP error: {response.status_code}"
        return True, None

    except requests.exceptions.Timeout:
        return False, "Timeout error"
    except requests.exceptions.ConnectionError:
        return False, "Connection error"
    except requests.exceptions.RequestException as exc:
        return False, f"Request error: {exc!s}"
    except Exception as exc:
        return False, f"Unexpected error: {exc!s}"
def generate_html(
    title: str,
    links: list[tuple[str, str]],
    subtitle: str | None = None,
    output_file: str = "index.html",
    template_path: str | None = None,
) -> str:
    """Generate an HTML page with the given title and links using Jinja2.

    Args:
        title (str): The title of the webpage
        links (list): A list of tuples with (name, url)
        subtitle (str, optional): A description to include on the page
        output_file (str, optional): The output HTML file
        template_path (str, optional): Path to a custom Jinja2 template file

    Returns:
        str: The path to the generated HTML file (the ``output_file`` argument)

    """
    template = load_template(template_path)
    timestamp = get_timestamp()

    # Fresh random value per page, exposed to the template as `nonce` (for CSP)
    nonce = secrets.token_urlsafe(16)

    # Render the template with our data
    html = template.render(
        title=title,
        links=links,
        description=subtitle,
        timestamp=timestamp,
        repository_url=get_git_repo_url(),
        nonce=nonce,
    )

    # Write as UTF-8 explicitly: without an encoding the platform's locale
    # encoding is used, which can fail on or garble non-ASCII titles and names
    with Path(output_file).open("w", encoding="utf-8") as f:
        f.write(html)

    return output_file
def parse_links_from_json(links_json: str) -> tuple[list[tuple[str, str]], list[str]]:
    """Parse links from a JSON string into a list of tuples.

    Supports multiple JSON formats:
    - List of objects: [{"name": "...", "url": "..."}, ...]
    - List of arrays: [["name", "url"], ...]
    - Dictionary: {"name1": "url1", "name2": "url2", ...}

    Each list item is handled on its own, so a list may mix arrays and objects.
    Names must pass ``validate_link_name`` and URLs must pass
    ``validate_url_format``. Anything that cannot be used (an invalid name or
    URL, a malformed item, an item of an unsupported type, or an unsupported
    top-level value) is skipped and reported in the returned warnings.

    Args:
        links_json (str): JSON-formatted string containing links

    Returns:
        tuple[list[tuple[str, str]], list[str]]: A tuple containing:
            - List of valid (name, url) tuples
            - List of warning messages for skipped items

    Raises:
        json.JSONDecodeError: If the JSON string is invalid

    """
    json_data = json.loads(links_json.strip())

    link_tuples: list[tuple[str, str]] = []
    warnings: list[str] = []

    def validate_and_append(name: str, url: str, context: str = "") -> None:
        """Validate a name/url pair and append if valid, otherwise add warning."""
        name_valid, name_error = validate_link_name(name)
        if not name_valid:
            warnings.append(f"Skipping item{context}: {name_error}")
            return

        url_valid, url_error = validate_url_format(url)
        if not url_valid:
            warnings.append(f"Skipping '{name}'{context}: {url_error}")
            return

        link_tuples.append((name, url))

    if isinstance(json_data, dict):
        # Dictionary: {"name1": "url1", "name2": "url2", ...}
        for name, url in json_data.items():
            validate_and_append(name, url)
    elif isinstance(json_data, list):
        # Dispatch per item so one odd entry does not discard the whole list
        for i, item in enumerate(json_data):
            context = f" at index {i}"
            if isinstance(item, list):
                # Array form: ["name", "url", ...]
                if len(item) >= MIN_LINK_ELEMENTS:
                    validate_and_append(item[0], item[1], context)
                else:
                    warnings.append(
                        f"Skipping item at index {i}: array must have at least {MIN_LINK_ELEMENTS} elements"
                    )
            elif isinstance(item, dict):
                # Object form: {"name": "...", "url": "..."}
                if "name" in item and "url" in item:
                    validate_and_append(item["name"], item["url"], context)
                else:
                    warnings.append(f"Skipping item at index {i}: missing 'name' or 'url' key")
            else:
                warnings.append(
                    f"Skipping item at index {i}: expected an array or object, got {type(item).__name__}"
                )
    else:
        # Valid JSON, but not a container of links (string, number, bool, null)
        warnings.append(
            f"Skipping input: expected a JSON list or object, got {type(json_data).__name__}"
        )

    return link_tuples, warnings
def validate_link_list(link_tuples: list[tuple[str, str]], delay: float = 0) -> tuple[bool, list[tuple[str, str, str]]]:
    """Probe every link and collect the ones that fail.

    Args:
        link_tuples (list[tuple[str, str]]): List of (name, url) tuples to validate
        delay (float, optional): Delay in seconds forwarded to each URL check (rate limiting)

    Returns:
        tuple[bool, list[tuple[str, str, str]]]: A tuple containing:
            - bool: True if no link failed, False otherwise
            - list: (name, url, error_message) for each failing link

    """
    failures: list[tuple[str, str, str]] = []

    # The progress bar wraps the iterable, so it advances once per checked link
    with typer.progressbar(link_tuples) as progress:
        for name, url in progress:
            ok, reason = validate_url(url, delay=delay)
            if ok:
                continue
            # Fall back to a generic message should the checker give no reason
            failures.append((name, url, reason or "Unknown error"))

    return not failures, failures
def _handle_parsing(links: str) -> list[tuple[str, str]]:
    """Parse the JSON links argument and report the outcome on the console.

    Args:
        links: JSON-formatted links string.

    Returns:
        The valid (name, url) tuples, or an empty list when parsing raised
        json.JSONDecodeError or TypeError.
    """
    try:
        parsed, skipped = parse_links_from_json(links)
    except (json.JSONDecodeError, TypeError):
        typer.echo("JSON parsing failed, falling back to legacy format")
        return []

    typer.echo(f"Parsed JSON links: {parsed}")

    # Skipped items are reported on stderr, one line per warning
    if skipped:
        typer.echo(f"\nWarning: {len(skipped)} item(s) skipped due to validation errors:", err=True)
        for message in skipped:
            typer.echo(f" - {message}", err=True)

    return parsed
def _handle_validation(link_tuples: list[tuple[str, str]], request_delay: float) -> bool:
    """Check link accessibility and decide whether generation should proceed.

    Args:
        link_tuples: (name, url) tuples to check.
        request_delay: Delay in seconds forwarded to the link checker.

    Returns:
        True when every link is valid, otherwise the user's answer to the
        confirmation prompt.
    """
    typer.echo("Validating links...")
    all_valid, invalid_links = validate_link_list(link_tuples, delay=request_delay)

    if all_valid:
        typer.echo("All links are valid!")
        return True

    # List each failing link on stderr, then let the user decide
    typer.echo(f"\nFound {len(invalid_links)} invalid links:", err=True)
    for name, url, error in invalid_links:
        typer.echo(f" - {name} ({url}): {error}", err=True)

    return bool(typer.confirm("Do you want to continue with invalid links?"))
def _generate_output(params: GenerationParams) -> int:
    """Produce the output file with the plugin matching the requested format.

    Args:
        params: Generation settings (format, title, links, subtitle, output
            directory, optional template).

    Returns:
        0 on success; 1 when the plugin lookup raises ValueError or generation
        raises FileNotFoundError or ImportError.
    """
    from minibook.plugins import get_plugin

    try:
        plugin_cls = get_plugin(params.output_format)
    except ValueError as e:
        typer.echo(f"Error: {e}", err=True)
        return 1

    format_key = params.output_format.lower()

    # Fixed file names for the known formats; any other format falls back to
    # "output" plus the plugin's own extension
    default_names = {
        "html": "index.html",
        "markdown": "links.md",
        "md": "links.md",
        "json": "links.json",
        "pdf": "links.pdf",
    }
    filename = default_names.get(format_key, f"output{plugin_cls.extension}")
    output_file = Path(params.output) / filename

    try:
        from minibook.plugins import HTMLPlugin, OutputPlugin

        # A custom template is only honoured for HTML output
        use_custom_template = format_key == "html" and params.template
        plugin: OutputPlugin = (
            HTMLPlugin(template_path=params.template) if use_custom_template else plugin_cls()
        )

        output_path = plugin.generate(params.title, params.link_tuples, params.subtitle, output_file)
    except (FileNotFoundError, ImportError) as e:
        typer.echo(f"Error: {e}", err=True)
        return 1

    typer.echo(f"{params.output_format.upper()} minibook created successfully: {Path(output_path).absolute()}")
    return 0
app = typer.Typer(help="Create a minibook from a list of links")


@app.command()  # type: ignore[untyped-decorator]
def entrypoint(
    title: str = typer.Option("My Links", "--title", "-t", help="Title of the minibook"),
    subtitle: str | None = typer.Option(None, "--subtitle", help="Subtitle of the minibook"),
    output: str = typer.Option("artifacts", "--output", "-o", help="Output directory"),
    links: str = typer.Option(
        None,
        "--links",
        "-l",
        help="JSON formatted links: can be a list of objects with name/url keys, a list of arrays, or a dictionary",
    ),
    validate_links: bool = typer.Option(False, "--validate-links", help="Validate that all links are accessible"),
    request_delay: float = typer.Option(
        0.0, "--request-delay", help="Delay in seconds between URL validation requests (rate limiting)"
    ),
    output_format: str = typer.Option("html", "--format", "-f", help="Output format: html, markdown, json, or pdf"),
    template: str | None = typer.Option(
        None, "--template", help="Path to a custom Jinja2 template file for HTML output"
    ),
) -> int:
    """Create a minibook from a list of links."""
    # Flow: parse --links, optionally check accessibility, then generate.
    # Returns 0 on success and 1 on failure; a missing --links exits the
    # process directly with status 1.
    if links is None:
        typer.echo("No links provided. Exiting.", err=True)
        sys.exit(1)

    typer.echo(f"Parsing links: {links}")

    link_tuples = _handle_parsing(links)
    if not link_tuples:
        # Nothing usable survived parsing and validation
        typer.echo("Error: No valid links to process.", err=True)
        return 1

    # Accessibility checks only run on request
    if validate_links:
        proceed = _handle_validation(link_tuples, request_delay)
        if not proceed:
            typer.echo("Aborting due to invalid links.", err=True)
            return 1

    return _generate_output(
        GenerationParams(
            output_format=output_format,
            title=title,
            link_tuples=link_tuples,
            subtitle=subtitle,
            output=output,
            template=template,
        )
    )
# Run the Typer application when the module is executed as a script
if __name__ == "__main__":
    app()  # pragma: no cover