Coverage for src / minibook / main.py: 100%
200 statements
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-14 15:04 +0000
« prev ^ index » next coverage.py v7.13.4, created at 2026-03-14 15:04 +0000
1"""MiniBook - A tool to create a webpage from a list of links.
3Generates a clean, responsive HTML webpage using Jinja2 templates.
4"""
6import configparser
7import json
8import secrets
9import sys
10import time
11from os import getenv
12from pathlib import Path
13from typing import NamedTuple
14from urllib.parse import urlparse
16import requests
17import typer
19from minibook.utils import get_timestamp, load_template
# HTTP status codes
# Responses whose status code is at or above this value are reported as failures.
HTTP_BAD_REQUEST = 400

# Minimum elements in a list-formatted link (the name followed by the URL)
MIN_LINK_ELEMENTS = 2

# Minimum parts in a domain name (dot-separated, e.g. "example" and "com")
MIN_DOMAIN_PARTS = 2
class GenerationParams(NamedTuple):
    """Parameters for minibook generation.

    Immutable bundle of the values that ``_generate_output`` reads when it
    selects an output plugin and writes the result.
    """

    # Name of the requested output format; used to look up the plugin.
    output_format: str
    # Title handed to the plugin's ``generate`` call.
    title: str
    # (name, url) pairs to include in the generated document.
    link_tuples: list[tuple[str, str]]
    # Optional subtitle; ``None`` when the user did not supply one.
    subtitle: str | None
    # Output directory; the generated file name is appended to it.
    output: str
    # Optional path to a custom template; only consulted for HTML output.
    template: str | None
42def validate_url_format(url: str) -> tuple[bool, str | None]:
43 """Validate URL format and scheme.
45 Checks that the URL is a non-empty string with http or https scheme, or a relative path.
46 Blocks potentially dangerous schemes like javascript:, data:, and file:.
48 Args:
49 url: The URL string to validate.
51 Returns:
52 A tuple of (is_valid, error_message). error_message is None if valid.
54 Examples:
55 Valid HTTP and HTTPS URLs return (True, None):
57 >>> validate_url_format("https://example.com")
58 (True, None)
59 >>> validate_url_format("http://example.com/path/to/page")
60 (True, None)
61 >>> validate_url_format("https://example.com?query=value&foo=bar")
62 (True, None)
64 Relative paths are allowed for local file references:
66 >>> validate_url_format("./tests/html-report/report.html")
67 (True, None)
68 >>> validate_url_format("../docs/index.html")
69 (True, None)
70 >>> validate_url_format("path/to/file.html")
71 (True, None)
73 Note: Bare filenames without path separators or ./ prefix may be rejected
74 if they look like domain names (contain dots). Use explicit path notation:
76 >>> validate_url_format("./file.tar.gz")
77 (True, None)
79 JavaScript URLs are rejected to prevent XSS attacks:
81 >>> validate_url_format("javascript:alert(1)")
82 (False, "Invalid URL scheme 'javascript': blocked for security")
84 Data URLs are rejected to prevent code injection:
86 >>> validate_url_format("data:text/html,<script>alert(1)</script>")
87 (False, "Invalid URL scheme 'data': blocked for security")
89 File URLs are rejected to prevent local file access:
91 >>> validate_url_format("file:///etc/passwd")
92 (False, "Invalid URL scheme 'file': blocked for security")
94 Empty strings and whitespace-only strings are rejected:
96 >>> validate_url_format("")
97 (False, 'URL must be a non-empty string')
98 >>> validate_url_format(" ")
99 (False, 'URL must be a non-empty string')
101 Non-string values are rejected:
103 >>> validate_url_format(None)
104 (False, 'URL must be a non-empty string')
105 >>> validate_url_format(123)
106 (False, 'URL must be a non-empty string')
108 Absolute URLs without a valid host are rejected:
110 >>> validate_url_format("https://")
111 (False, 'URL must have a valid host')
113 Malformed URLs that look like domains without scheme are rejected:
115 >>> validate_url_format("example.com")
116 (False, "Invalid URL scheme '': looks like a domain without http:// or https://")
117 >>> validate_url_format("://example.com")
118 (False, "Invalid URL scheme '': malformed URL with '://' but no scheme")
120 """
121 if not isinstance(url, str) or not url.strip():
122 return False, "URL must be a non-empty string"
124 try:
125 parsed = urlparse(url)
126 except Exception as e:
127 return False, f"Invalid URL: {e}"
129 # Block dangerous schemes
130 dangerous_schemes = ("javascript", "data", "file", "vbscript", "about")
131 if parsed.scheme in dangerous_schemes:
132 return False, f"Invalid URL scheme '{parsed.scheme}': blocked for security"
134 # Handle URLs with no scheme
135 if not parsed.scheme:
136 # Reject malformed URLs like "://example.com"
137 if url.startswith("://"):
138 return False, "Invalid URL scheme '': malformed URL with '://' but no scheme"
140 # Reject domain-like strings without scheme (e.g., "example.com", "sub.example.com")
141 # These look like absolute URLs missing the scheme
142 # Valid relative paths typically start with ./, ../, or contain / early in the path
143 if not url.startswith("./") and not url.startswith("../"):
144 # Get the part before the first path separator
145 first_part = url.split("/")[0].split("?")[0].split("#")[0]
147 # If it contains a dot and looks like a domain name (no path separators at all)
148 # OR has multiple dot-separated parts suggesting a domain
149 if "." in first_part:
150 parts = first_part.split(".")
151 # Domain-like: has 2+ parts and no empty parts (e.g., "example.com", "sub.example.com")
152 if len(parts) >= MIN_DOMAIN_PARTS and all(part for part in parts):
153 return False, "Invalid URL scheme '': looks like a domain without http:// or https://"
155 # Accept as relative path
156 return True, None
158 # For absolute URLs, require http or https with a valid host
159 if parsed.scheme in ("http", "https"):
160 if not parsed.netloc:
161 return False, "URL must have a valid host"
162 return True, None
164 # Any other scheme is not allowed
165 return False, f"Invalid URL scheme '{parsed.scheme}': only http, https, or relative paths allowed"
168def validate_link_name(name: str) -> tuple[bool, str | None]:
169 r"""Validate link name.
171 Ensures the link name is a non-empty string. Names are used as display
172 text for links in the generated HTML.
174 Args:
175 name: The link name string to validate.
177 Returns:
178 A tuple of (is_valid, error_message). error_message is None if valid.
180 Examples:
181 Valid non-empty strings return (True, None):
183 >>> validate_link_name("My Link")
184 (True, None)
185 >>> validate_link_name("GitHub")
186 (True, None)
187 >>> validate_link_name("A")
188 (True, None)
190 Empty strings are rejected:
192 >>> validate_link_name("")
193 (False, 'Name must be a non-empty string')
195 Whitespace-only strings are rejected:
197 >>> validate_link_name(" ")
198 (False, 'Name must be a non-empty string')
199 >>> validate_link_name("\t\n")
200 (False, 'Name must be a non-empty string')
202 Non-string values are rejected:
204 >>> validate_link_name(None)
205 (False, 'Name must be a non-empty string')
206 >>> validate_link_name(123)
207 (False, 'Name must be a non-empty string')
208 >>> validate_link_name(["list"])
209 (False, 'Name must be a non-empty string')
211 """
212 if not isinstance(name, str) or not name.strip():
213 return False, "Name must be a non-empty string"
214 return True, None
def get_git_repo_url() -> str:
    """Retrieve the GitHub repository URL.

    Checks, in order:
    1. The ``GITHUB_REPOSITORY`` environment variable (e.g. set by GitHub Actions).
    2. The ``[remote "origin"]`` URL in ``.git/config`` (works in local clones).
       SSH (``git@github.com:owner/repo``, ``ssh://git@github.com/owner/repo``),
       ``git://`` and HTTP(S) GitHub remotes are normalised to HTTPS.
    3. Falls back to the hardcoded default ``tschm/minibook``.

    Any problem reading or parsing ``.git/config`` is treated as "no remote
    found" and results in the default URL.

    Returns:
        The full HTTPS URL for the GitHub repository.
    """
    default_url = "https://github.com/tschm/minibook"

    github_repo = getenv("GITHUB_REPOSITORY")
    if github_repo:
        return f"https://github.com/{github_repo}"

    # Try to read the origin remote URL from .git/config
    try:
        git_config_path = Path(".git/config")
        if not git_config_path.exists():
            return default_url

        # Git config is only loosely INI-compatible: keys may repeat (several
        # "fetch =" lines are common), keys may appear without a value, and
        # values may contain a literal "%". The strict defaults would raise on
        # these and make this function return the default repository.
        config = configparser.ConfigParser(strict=False, interpolation=None, allow_no_value=True)
        config.read(git_config_path, encoding="utf-8")
        remote_url = config.get('remote "origin"', "url", fallback=None)
    except (OSError, UnicodeDecodeError, configparser.Error):
        # Best effort only: an unreadable or unparsable config means "unknown"
        return default_url

    if remote_url:
        remote_url = remote_url.strip()
        github_prefixes = (
            "git@github.com:",
            "ssh://git@github.com/",
            "https://github.com/",
            "http://github.com/",
            "git://github.com/",
        )
        for prefix in github_prefixes:
            if remote_url.startswith(prefix):
                repo = remote_url[len(prefix) :].rstrip("/").removesuffix(".git")
                if repo:
                    return f"https://github.com/{repo}"
                break

    return default_url
254def validate_url(url: str, timeout: int = 5, delay: float = 0) -> tuple[bool, str | None]:
255 """Validate if a URL is accessible.
257 For HTTP/HTTPS URLs, makes a network request to check accessibility.
258 For relative paths, checks whether the file exists on the local filesystem.
260 Args:
261 url (str): The URL to validate
262 timeout (int, optional): Timeout in seconds for the request
263 delay (float, optional): Delay in seconds before making the request (rate limiting)
265 Returns:
266 tuple: (is_valid, error_message) where is_valid is a boolean and error_message is a string
267 error_message is None if the URL is valid
269 """
270 if delay > 0:
271 time.sleep(delay)
273 # Relative paths are validated by checking local filesystem accessibility
274 parsed = urlparse(url)
275 if not parsed.scheme or parsed.scheme not in ("http", "https"):
276 path = Path(url)
277 if path.exists():
278 return True, None
279 return False, f"Relative path not accessible: {url}"
281 try:
282 # Make a HEAD request to check if the URL is accessible
283 # HEAD is more efficient than GET as it doesn't download the full content
284 response = requests.head(url, timeout=timeout, allow_redirects=True)
286 # If the HEAD request fails, try a GET request as some servers don't support HEAD
287 if response.status_code >= HTTP_BAD_REQUEST:
288 response = requests.get(url, timeout=timeout, allow_redirects=True)
290 # Check if the response status code indicates success
291 if response.status_code < HTTP_BAD_REQUEST:
292 return True, None
293 else:
294 return False, f"HTTP error: {response.status_code}"
296 except requests.exceptions.Timeout:
297 return False, "Timeout error"
298 except requests.exceptions.ConnectionError:
299 return False, "Connection error"
300 except requests.exceptions.RequestException as e:
301 return False, f"Request error: {e!s}"
302 except Exception as e:
303 return False, f"Unexpected error: {e!s}"
def generate_html(
    title: str,
    links: list[tuple[str, str]],
    subtitle: str | None = None,
    output_file: str = "index.html",
    template_path: str | None = None,
) -> str:
    """Generate an HTML page with the given title and links using Jinja2.

    The rendered page is written as UTF-8 regardless of the platform's default
    encoding, so non-ASCII titles and link names are stored the same way on
    every system.

    Args:
        title (str): The title of the webpage
        links (list): A list of tuples with (name, url)
        subtitle (str, optional): A description to include on the page
        output_file (str, optional): The output HTML file
        template_path (str, optional): Path to a custom Jinja2 template file

    Returns:
        str: The path to the generated HTML file

    """
    template = load_template(template_path)
    timestamp = get_timestamp()

    # Generate a unique nonce for CSP
    nonce = secrets.token_urlsafe(16)

    # Render the template with our data
    html = template.render(
        title=title,
        links=links,
        description=subtitle,
        timestamp=timestamp,
        repository_url=get_git_repo_url(),
        nonce=nonce,
    )

    # Save the HTML to a file; the encoding is pinned so the output does not
    # depend on the locale of the machine running the tool.
    with Path(output_file).open("w", encoding="utf-8") as f:
        f.write(html)

    return output_file
def parse_links_from_json(links_json: str) -> tuple[list[tuple[str, str]], list[str]]:
    """Parse links from a JSON string into a list of tuples.

    Supports multiple JSON formats:
    - List of objects: [{"name": "...", "url": "..."}, ...]
    - List of arrays: [["name", "url"], ...]
    - Dictionary: {"name1": "url1", "name2": "url2", ...}

    Lists are processed item by item, so a list mixing objects and arrays is
    accepted as well. Names are checked with ``validate_link_name`` and URLs
    with ``validate_url_format`` (http/https URLs or relative paths). Invalid
    or unsupported items are skipped and reported as warnings rather than
    being dropped silently.

    Args:
        links_json (str): JSON-formatted string containing links

    Returns:
        tuple[list[tuple[str, str]], list[str]]: A tuple containing:
            - List of valid (name, url) tuples
            - List of warning messages for skipped items

    Raises:
        json.JSONDecodeError: If the JSON string is invalid

    """
    json_data = json.loads(links_json.strip())

    link_tuples: list[tuple[str, str]] = []
    warnings: list[str] = []

    def validate_and_append(name: str, url: str, context: str = "") -> None:
        """Validate a name/url pair and append if valid, otherwise add warning."""
        name_valid, name_error = validate_link_name(name)
        if not name_valid:
            warnings.append(f"Skipping item{context}: {name_error}")
            return

        url_valid, url_error = validate_url_format(url)
        if not url_valid:
            warnings.append(f"Skipping '{name}'{context}: {url_error}")
            return

        link_tuples.append((name, url))

    if isinstance(json_data, dict):
        # Dictionary: {"name1": "url1", "name2": "url2", ...}
        for name, url in json_data.items():
            validate_and_append(name, url)
    elif isinstance(json_data, list):
        # Dispatch on each item's own type so one odd entry cannot cause the
        # whole list to be ignored.
        for i, item in enumerate(json_data):
            context = f" at index {i}"
            if isinstance(item, list):
                # Array form: ["name", "url", ...]
                if len(item) >= MIN_LINK_ELEMENTS:
                    validate_and_append(item[0], item[1], context)
                else:
                    warnings.append(
                        f"Skipping item at index {i}: array must have at least {MIN_LINK_ELEMENTS} elements"
                    )
            elif isinstance(item, dict):
                # Object form: {"name": "...", "url": "..."}
                if "name" in item and "url" in item:
                    validate_and_append(item["name"], item["url"], context)
                else:
                    warnings.append(f"Skipping item at index {i}: missing 'name' or 'url' key")
            else:
                warnings.append(
                    f"Skipping item at index {i}: expected an object or array, got {type(item).__name__}"
                )
    else:
        # Valid JSON, but not a structure that can describe links
        warnings.append(
            f"Unsupported JSON structure: expected a list or an object, got {type(json_data).__name__}"
        )

    return link_tuples, warnings
def validate_link_list(link_tuples: list[tuple[str, str]], delay: float = 0) -> tuple[bool, list[tuple[str, str, str]]]:
    """Check every link for accessibility and collect the failures.

    Each URL is passed to ``validate_url`` while a Typer progress bar tracks
    the iteration.

    Args:
        link_tuples (list[tuple[str, str]]): List of (name, url) tuples to validate
        delay (float, optional): Delay in seconds forwarded to each ``validate_url`` call

    Returns:
        tuple[bool, list[tuple[str, str, str]]]: A tuple containing:
            - bool: True if no link failed validation, False otherwise
            - list: (name, url, error_message) tuples for the links that failed

    """
    failures: list[tuple[str, str, str]] = []

    with typer.progressbar(link_tuples) as bar:
        for link_name, link_url in bar:
            ok, reason = validate_url(link_url, delay=delay)
            if ok:
                continue
            # A failed check is expected to carry a message; fall back just in case
            failures.append((link_name, link_url, reason if reason else "Unknown error"))

    return not failures, failures
def _handle_parsing(links: str) -> list[tuple[str, str]]:
    """Parse the ``--links`` JSON and report problems to the user.

    Returns the valid (name, url) tuples, or an empty list when the JSON
    cannot be parsed. Warnings about skipped items are written to stderr.
    """
    try:
        link_tuples, parse_warnings = parse_links_from_json(links)
    except (json.JSONDecodeError, TypeError) as exc:
        # NOTE(review): the message mentions a legacy fallback, but this helper
        # simply returns an empty list.
        typer.echo(f"JSON parsing failed, falling back to legacy format: {exc}")
        return []

    typer.echo(f"Parsed JSON links: {link_tuples}")

    if not parse_warnings:
        return link_tuples

    # Tell the user which items were dropped and why
    typer.echo(f"\nWarning: {len(parse_warnings)} item(s) skipped due to validation errors:", err=True)
    for message in parse_warnings:
        typer.echo(f"  - {message}", err=True)

    return link_tuples
def _handle_validation(link_tuples: list[tuple[str, str]], request_delay: float) -> bool:
    """Validate the links and, if any fail, ask whether to continue.

    Returns True when all links are valid or the user confirms continuing,
    False when the user declines.
    """
    typer.echo("Validating links...")
    all_valid, invalid_links = validate_link_list(link_tuples, delay=request_delay)

    if all_valid:
        typer.echo("All links are valid!")
        return True

    # List every failing link on stderr before prompting
    typer.echo(f"\nFound {len(invalid_links)} invalid links:", err=True)
    for link_name, link_url, reason in invalid_links:
        typer.echo(f"  - {link_name} ({link_url}): {reason}", err=True)

    return bool(typer.confirm("Do you want to continue with invalid links?"))
def _generate_output(params: GenerationParams) -> int:
    """Write the minibook with the plugin matching ``params.output_format``.

    Returns 0 on success and 1 when the format is unknown or generation fails
    with a missing file or import error; the error is echoed to stderr.
    """
    from minibook.plugins import get_plugin

    try:
        plugin_cls = get_plugin(params.output_format)
    except ValueError as exc:
        typer.echo(f"Error: {exc}", err=True)
        return 1

    # Well-known formats get a fixed file name; others use the plugin's extension
    format_key = params.output_format.lower()
    known_filenames = {
        "html": "index.html",
        "markdown": "links.md",
        "md": "links.md",
        "json": "links.json",
        "pdf": "links.pdf",
    }
    if format_key in known_filenames:
        filename = known_filenames[format_key]
    else:
        filename = f"output{plugin_cls.extension}"
    output_file = Path(params.output) / filename

    try:
        from minibook.plugins import HTMLPlugin, OutputPlugin

        # A custom template is only honoured for HTML output
        plugin: OutputPlugin
        if params.output_format.lower() == "html" and params.template:
            plugin = HTMLPlugin(template_path=params.template)
        else:
            plugin = plugin_cls()

        output_path = plugin.generate(params.title, params.link_tuples, params.subtitle, output_file)
    except (FileNotFoundError, ImportError) as exc:
        typer.echo(f"Error: {exc}", err=True)
        return 1

    typer.echo(f"{params.output_format.upper()} minibook created successfully: {Path(output_path).absolute()}")
    return 0
# Typer application object; commands are registered on it with ``@app.command()``.
app = typer.Typer(help="Create a minibook from a list of links")
@app.command()
def entrypoint(
    title: str = typer.Option("My Links", "--title", "-t", help="Title of the minibook"),
    subtitle: str | None = typer.Option(None, "--subtitle", help="Subtitle of the minibook"),
    output: str = typer.Option("artifacts", "--output", "-o", help="Output directory"),
    links: str = typer.Option(
        None,
        "--links",
        "-l",
        help="JSON formatted links: can be a list of objects with name/url keys, a list of arrays, or a dictionary",
    ),
    validate_links: bool = typer.Option(False, "--validate-links", help="Validate that all links are accessible"),
    request_delay: float = typer.Option(
        0.0, "--request-delay", help="Delay in seconds between URL validation requests (rate limiting)"
    ),
    output_format: str = typer.Option("html", "--format", "-f", help="Output format: html, markdown, json, or pdf"),
    template: str | None = typer.Option(
        None, "--template", help="Path to a custom Jinja2 template file for HTML output"
    ),
) -> int:
    """Create a minibook from a list of links."""
    # Without --links there is nothing to do; this path exits the process directly
    if links is None:
        typer.echo("No links provided. Exiting.", err=True)
        sys.exit(1)

    typer.echo(f"Parsing links: {links}")

    # Step 1: turn the JSON into validated (name, url) tuples
    link_tuples = _handle_parsing(links)
    if not link_tuples:
        typer.echo("Error: No valid links to process.", err=True)
        return 1

    # Step 2: optional accessibility check, which the user may override
    if validate_links:
        proceed = _handle_validation(link_tuples, request_delay)
        if not proceed:
            typer.echo("Aborting due to invalid links.", err=True)
            return 1

    # Step 3: hand everything to the output plugin
    return _generate_output(
        GenerationParams(
            output_format=output_format,
            title=title,
            link_tuples=link_tuples,
            subtitle=subtitle,
            output=output,
            template=template,
        )
    )
# Run the Typer application when this module is executed as a script.
if __name__ == "__main__":
    app()  # pragma: no cover