feat(skills): add pdf-reader skill (#772)
Thanks @glifocat! Clean skill package — good docs, solid tests, nice intent files. Pushed a small fix for path traversal on the PDF filename before merging.
This commit is contained in:
@@ -0,0 +1,94 @@
|
||||
---
|
||||
name: pdf-reader
|
||||
description: Read and extract text from PDF files — documents, reports, contracts, spreadsheets. Use whenever you need to read PDF content, not just when explicitly asked. Handles local files, URLs, and WhatsApp attachments.
|
||||
allowed-tools: Bash(pdf-reader:*)
|
||||
---
|
||||
|
||||
# PDF Reader
|
||||
|
||||
## Quick start
|
||||
|
||||
```bash
|
||||
pdf-reader extract report.pdf # Extract all text
|
||||
pdf-reader extract report.pdf --layout # Preserve tables/columns
|
||||
pdf-reader fetch https://example.com/doc.pdf # Download and extract
|
||||
pdf-reader info report.pdf # Show metadata + size
|
||||
pdf-reader list # List all PDFs in directory tree
|
||||
```
|
||||
|
||||
## Commands
|
||||
|
||||
### extract — Extract text from PDF
|
||||
|
||||
```bash
|
||||
pdf-reader extract <file> # Full text to stdout
|
||||
pdf-reader extract <file> --layout # Preserve layout (tables, columns)
|
||||
pdf-reader extract <file> --pages 1-5 # Pages 1 through 5
|
||||
pdf-reader extract <file> --pages 3-3 # Single page (page 3)
|
||||
pdf-reader extract <file> --layout --pages 2-10 # Layout + page range
|
||||
```
|
||||
|
||||
Options:
|
||||
- `--layout` — Maintains spatial positioning. Essential for tables, spreadsheets, multi-column docs.
|
||||
- `--pages N-M` — Extract only pages N through M (1-based, inclusive).
|
||||
|
||||
### fetch — Download and extract PDF from URL
|
||||
|
||||
```bash
|
||||
pdf-reader fetch <url> # Download, verify, extract with layout
|
||||
pdf-reader fetch <url> report.pdf # Also save a local copy
|
||||
```
|
||||
|
||||
Downloads the PDF, verifies it has a valid `%PDF` header, then extracts text with layout preservation. Temporary files are cleaned up automatically.
|
||||
|
||||
### info — PDF metadata and file size
|
||||
|
||||
```bash
|
||||
pdf-reader info <file>
|
||||
```
|
||||
|
||||
Shows title, author, page count, page size, PDF version, and file size on disk.
|
||||
|
||||
### list — Find all PDFs in directory tree
|
||||
|
||||
```bash
|
||||
pdf-reader list
|
||||
```
|
||||
|
||||
Recursively lists all `.pdf` files with page count and file size.
|
||||
|
||||
## WhatsApp PDF attachments
|
||||
|
||||
When a user sends a PDF on WhatsApp, it is automatically saved to the `attachments/` directory. The message will include a path hint like:
|
||||
|
||||
> [PDF attached: attachments/document.pdf]
|
||||
|
||||
To read the attached PDF:
|
||||
|
||||
```bash
|
||||
pdf-reader extract attachments/document.pdf --layout
|
||||
```
|
||||
|
||||
## Example workflows
|
||||
|
||||
### Read a contract and summarize key terms
|
||||
|
||||
```bash
|
||||
pdf-reader info attachments/contract.pdf
|
||||
pdf-reader extract attachments/contract.pdf --layout
|
||||
```
|
||||
|
||||
### Extract specific pages from a long report
|
||||
|
||||
```bash
|
||||
pdf-reader info report.pdf # Check total pages
|
||||
pdf-reader extract report.pdf --pages 1-3 # Executive summary
|
||||
pdf-reader extract report.pdf --pages 15-20 # Financial tables
|
||||
```
|
||||
|
||||
### Fetch and analyze a public document
|
||||
|
||||
```bash
|
||||
pdf-reader fetch https://example.com/annual-report.pdf report.pdf
|
||||
pdf-reader info report.pdf
|
||||
```
|
||||
203
.claude/skills/add-pdf-reader/add/container/skills/pdf-reader/pdf-reader
Executable file
203
.claude/skills/add-pdf-reader/add/container/skills/pdf-reader/pdf-reader
Executable file
@@ -0,0 +1,203 @@
|
||||
#!/bin/bash
#
# pdf-reader — command-line front end for poppler-utils (pdftotext, pdfinfo).
# Subcommands: extract, fetch, info, list.

# Strict mode: abort on unhandled errors, unset variables and failed pipeline stages.
set -o errexit -o nounset -o pipefail

# Version string reported by the "version" subcommand.
VERSION='1.0.0'
|
||||
|
||||
# Print the help text to stdout.
# The heredoc delimiter is quoted, so nothing inside it is expanded.
# The "version" command is listed because the dispatcher accepts it.
usage() {
  cat <<'USAGE'
pdf-reader — Extract text and metadata from PDF files

Usage:
  pdf-reader extract <file> [--layout] [--pages N-M]
  pdf-reader fetch <url> [filename]
  pdf-reader info <file>
  pdf-reader list
  pdf-reader help
  pdf-reader version

Commands:
  extract   Extract text from a PDF file to stdout
  fetch     Download a PDF from a URL and extract text
  info      Show PDF metadata and file size
  list      List all PDFs in current directory tree
  help      Show this help message
  version   Show the pdf-reader version

Extract options:
  --layout  Preserve original layout (tables, columns)
  --pages   Page range to extract (e.g. 1-5, 3-3 for single page)
USAGE
}
|
||||
|
||||
# Extract text from a PDF file and write it to stdout via pdftotext.
#
# Arguments (any order):
#   <file>          Path to the PDF; exactly one is required.
#   --layout        Pass -layout to pdftotext.
#   --pages RANGE   Page range: N-M, N (single page), N- (N to the end)
#                   or -M (start to M).
#
# Outputs: extracted text on stdout; error messages on stderr.
# Exits 1 on an unknown option, an extra positional argument, a missing or
# malformed --pages value, a missing <file>, or a path that is not a regular
# file.
cmd_extract() {
  local file=""
  local layout=false
  local first_page=""
  local last_page=""
  local range=""

  # Parse arguments
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --layout)
        layout=true
        shift
        ;;
      --pages)
        if [[ -z "${2:-}" ]]; then
          echo "Error: --pages requires a range argument (e.g. 1-5)" >&2
          exit 1
        fi
        range="$2"
        # Validate here so malformed input (abc, 1-2-3, a stray option name)
        # is rejected with a clear message instead of reaching pdftotext as
        # a bogus -f/-l value.
        if [[ "$range" =~ ^[0-9]+$ ]]; then
          first_page="$range"
          last_page="$range"
        elif [[ "$range" != "-" && "$range" =~ ^([0-9]*)-([0-9]*)$ ]]; then
          first_page="${BASH_REMATCH[1]}"
          last_page="${BASH_REMATCH[2]}"
        else
          echo "Error: Invalid page range: $range (expected N-M, e.g. 1-5)" >&2
          exit 1
        fi
        # 10# forces base 10 so a leading zero is not read as octal.
        if [[ -n "$first_page" && -n "$last_page" ]] \
          && (( 10#$first_page > 10#$last_page )); then
          echo "Error: Invalid page range: $range (first page is after last page)" >&2
          exit 1
        fi
        shift 2
        ;;
      -*)
        echo "Error: Unknown option: $1" >&2
        exit 1
        ;;
      *)
        if [[ -z "$file" ]]; then
          file="$1"
        else
          echo "Error: Unexpected argument: $1" >&2
          exit 1
        fi
        shift
        ;;
    esac
  done

  if [[ -z "$file" ]]; then
    echo "Error: No file specified" >&2
    echo "Usage: pdf-reader extract <file> [--layout] [--pages N-M]" >&2
    exit 1
  fi

  if [[ ! -f "$file" ]]; then
    echo "Error: File not found: $file" >&2
    exit 1
  fi

  # Build the pdftotext command line; the trailing "-" sends text to stdout.
  local args=()
  if [[ "$layout" == true ]]; then
    args+=(-layout)
  fi
  if [[ -n "$first_page" ]]; then
    args+=(-f "$first_page")
  fi
  if [[ -n "$last_page" ]]; then
    args+=(-l "$last_page")
  fi
  args+=("$file" -)

  pdftotext "${args[@]}"
}
|
||||
|
||||
# Download a PDF from a URL and print its text (layout-preserving) to stdout.
#
# Arguments:
#   $1 - URL to download (required).
#   $2 - Optional path; when given, a copy of the downloaded PDF is saved there.
#
# Outputs: extracted text on stdout; progress and error messages on stderr.
# Exits 1 when the URL is missing, the temp file cannot be created, the
# download fails, the payload does not start with "%PDF", or the copy fails.
cmd_fetch() {
  local url="${1:-}"
  local filename="${2:-}"

  if [[ -z "$url" ]]; then
    echo "Error: No URL specified" >&2
    echo "Usage: pdf-reader fetch <url> [filename]" >&2
    exit 1
  fi

  # Create temporary file
  local tmpfile
  if ! tmpfile="$(mktemp /tmp/pdf-reader-XXXXXX.pdf)"; then
    echo "Error: Failed to create temporary file" >&2
    exit 1
  fi

  # The EXIT trap normally fires after this function has returned, when the
  # local "tmpfile" is out of scope: a trap that expands it lazily would hit
  # "unbound variable" under `set -u` and leave the file behind. Bake the
  # shell-quoted path into the trap command now.
  local cleanup_cmd
  printf -v cleanup_cmd 'rm -f -- %q' "$tmpfile"
  # shellcheck disable=SC2064 — early expansion is intentional (see above)
  trap "$cleanup_cmd" EXIT

  # Download.
  #   -f  treat HTTP error statuses as failures instead of saving the error page
  #   -S  still print curl's own error message despite -s
  #   --  end of options, so the URL can never be parsed as a curl flag
  echo "Downloading: $url" >&2
  if ! curl -fsSL -o "$tmpfile" -- "$url"; then
    echo "Error: Failed to download: $url" >&2
    exit 1
  fi

  # Verify PDF header (NUL bytes are dropped so Bash does not warn about them
  # when a binary non-PDF payload is captured).
  local header
  header="$(head -c 4 "$tmpfile" | tr -d '\0')"
  if [[ "$header" != "%PDF" ]]; then
    echo "Error: Downloaded file is not a valid PDF (header: $header)" >&2
    exit 1
  fi

  # Save with name if requested
  if [[ -n "$filename" ]]; then
    if ! cp -- "$tmpfile" "$filename"; then
      echo "Error: Failed to save to: $filename" >&2
      exit 1
    fi
    echo "Saved to: $filename" >&2
  fi

  # Extract with layout
  pdftotext -layout "$tmpfile" -
}
|
||||
|
||||
# Print pdfinfo's report for a PDF, then a blank line and its size on disk.
#
# Arguments:
#   $1 - Path to the PDF (required; must be a regular file).
#
# Outputs: the report on stdout; error messages on stderr.
# Exits 1 when no path is given or the path is not a regular file.
cmd_info() {
  local target="${1:-}"

  # Guard clauses: bail out before touching any external tool.
  [[ -n "$target" ]] || {
    printf '%s\n' \
      "Error: No file specified" \
      "Usage: pdf-reader info <file>" >&2
    exit 1
  }

  [[ -f "$target" ]] || {
    printf '%s\n' "Error: File not found: $target" >&2
    exit 1
  }

  pdfinfo "$target"
  printf '\n'
  # du -h prints "<size><TAB><path>"; keep only the size column.
  printf 'File size: %s\n' "$(du -h "$target" | cut -f1)"
}
|
||||
|
||||
# List every .pdf file under the current directory with page count and size.
#
# Takes no arguments. Prints one line per file to stdout:
#   <path padded to 60> <pages> pages <size>
# The page count is "?" when pdfinfo yields no "Pages:" line.
# If nothing is found, a notice is written to stderr.
#
# Side effect: enables the nullglob and globstar shell options and leaves
# them enabled.
cmd_list() {
  local found=false
  local pdf tool_path pages size page_line

  # Use globbing to find PDFs (globstar makes **/ match recursively)
  shopt -s nullglob globstar

  # Both globs are kept so top-level files are printed first; the associative
  # array removes the overlap (*.pdf is also matched by **/*.pdf).
  local -A seen=()
  for pdf in *.pdf **/*.pdf; do
    # Only regular files: a directory that happens to be named *.pdf is not a PDF.
    [[ -f "$pdf" ]] || continue

    # Membership is tested with ${seen[...]+x} rather than [[ -v seen[...] ]]:
    # on older Bash versions -v evaluates the subscript a second time, so a
    # file name containing $(...) would be executed.
    if [[ -n "${seen[$pdf]+x}" ]]; then
      continue
    fi
    seen["$pdf"]=1
    found=true

    # A name starting with "-" would be parsed as an option by du/pdfinfo;
    # hand the tools a ./-prefixed path while still printing the plain name.
    tool_path="$pdf"
    if [[ "$pdf" == -* ]]; then
      tool_path="./$pdf"
    fi

    pages="?"
    size="$(du -h "$tool_path" | cut -f1)"

    # Try to get page count from pdfinfo
    if page_line="$(pdfinfo "$tool_path" 2>/dev/null | grep '^Pages:')"; then
      pages="$(awk '{print $2}' <<<"$page_line")"
    fi

    printf '%-60s %5s pages %8s\n' "$pdf" "$pages" "$size"
  done

  if [[ "$found" == false ]]; then
    echo "No PDF files found in current directory tree." >&2
  fi
}
|
||||
|
||||
# Main dispatch: route the first CLI argument to its subcommand handler.
# A missing or empty first argument falls back to "help"; the remaining
# arguments are forwarded to the handler untouched.
command=help
if (( $# > 0 )); then
  if [[ -n "$1" ]]; then
    command="$1"
  fi
  shift
fi

case "$command" in
  extract)
    cmd_extract "$@"
    ;;
  fetch)
    cmd_fetch "$@"
    ;;
  info)
    cmd_info "$@"
    ;;
  list)
    cmd_list
    ;;
  help | --help | -h)
    usage
    ;;
  version | --version | -v)
    printf 'pdf-reader %s\n' "$VERSION"
    ;;
  *)
    printf '%s\n' \
      "Error: Unknown command: $command" \
      "Run 'pdf-reader help' for usage." >&2
    exit 1
    ;;
esac
|
||||
Reference in New Issue
Block a user