From 03f792bfce2b12449ab2fcaffffc35ce346da8e4 Mon Sep 17 00:00:00 2001 From: glifocat Date: Wed, 4 Mar 2026 14:56:31 +0100 Subject: [PATCH] feat(skills): add use-local-whisper skill package (#702) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Thanks for the great contribution @glifocat! This is a really well-structured skill — clean package, thorough docs, and solid test coverage. Hope to see more skills like this from you! --- .../modify/src/channels/whatsapp.test.ts | 4 + .../src/channels/whatsapp.test.ts.intent.md | 1 + .claude/skills/use-local-whisper/SKILL.md | 128 ++++++++++++++++++ .../skills/use-local-whisper/manifest.yaml | 12 ++ .../modify/src/transcription.ts | 95 +++++++++++++ .../modify/src/transcription.ts.intent.md | 39 ++++++ .../tests/use-local-whisper.test.ts | 115 ++++++++++++++++ 7 files changed, 394 insertions(+) create mode 100644 .claude/skills/use-local-whisper/SKILL.md create mode 100644 .claude/skills/use-local-whisper/manifest.yaml create mode 100644 .claude/skills/use-local-whisper/modify/src/transcription.ts create mode 100644 .claude/skills/use-local-whisper/modify/src/transcription.ts.intent.md create mode 100644 .claude/skills/use-local-whisper/tests/use-local-whisper.test.ts diff --git a/.claude/skills/add-voice-transcription/modify/src/channels/whatsapp.test.ts b/.claude/skills/add-voice-transcription/modify/src/channels/whatsapp.test.ts index b56c6c4..b6ef502 100644 --- a/.claude/skills/add-voice-transcription/modify/src/channels/whatsapp.test.ts +++ b/.claude/skills/add-voice-transcription/modify/src/channels/whatsapp.test.ts @@ -90,6 +90,10 @@ vi.mock('@whiskeysockets/baileys', () => { timedOut: 408, restartRequired: 515, }, + fetchLatestWaWebVersion: vi + .fn() + .mockResolvedValue({ version: [2, 3000, 0] }), + normalizeMessageContent: vi.fn((content: unknown) => content), makeCacheableSignalKeyStore: vi.fn((keys: unknown) => keys), useMultiFileAuthState: vi.fn().mockResolvedValue({ 
state: { diff --git a/.claude/skills/add-voice-transcription/modify/src/channels/whatsapp.test.ts.intent.md b/.claude/skills/add-voice-transcription/modify/src/channels/whatsapp.test.ts.intent.md index 5856320..a07e7f0 100644 --- a/.claude/skills/add-voice-transcription/modify/src/channels/whatsapp.test.ts.intent.md +++ b/.claude/skills/add-voice-transcription/modify/src/channels/whatsapp.test.ts.intent.md @@ -8,6 +8,7 @@ Added mock for the transcription module and 3 new test cases for voice message h ### Mocks (top of file) - Added: `vi.mock('../transcription.js', ...)` with `isVoiceMessage` and `transcribeAudioMessage` mocks - Added: `import { transcribeAudioMessage } from '../transcription.js'` for test assertions +- Updated: Baileys mock to include `fetchLatestWaWebVersion` and `normalizeMessageContent` exports (required by current upstream whatsapp.ts) ### Test cases (inside "message handling" describe block) - Changed: "handles message with no extractable text (e.g. voice note without caption)" → "transcribes voice messages" diff --git a/.claude/skills/use-local-whisper/SKILL.md b/.claude/skills/use-local-whisper/SKILL.md new file mode 100644 index 0000000..7620b0f --- /dev/null +++ b/.claude/skills/use-local-whisper/SKILL.md @@ -0,0 +1,128 @@ +--- +name: use-local-whisper +description: Use when the user wants local voice transcription instead of OpenAI Whisper API. Switches to whisper.cpp running on Apple Silicon. WhatsApp only for now. Requires voice-transcription skill to be applied first. +--- + +# Use Local Whisper + +Switches voice transcription from OpenAI's Whisper API to local whisper.cpp. Runs entirely on-device — no API key, no network, no cost. + +**Channel support:** Currently WhatsApp only. The transcription module (`src/transcription.ts`) uses Baileys types for audio download. Other channels (Telegram, Discord, etc.) would need their own audio-download logic before this skill can serve them. 
+ +**Note:** The Homebrew package is `whisper-cpp`, but the CLI binary it installs is `whisper-cli`. + +## Prerequisites + +- `voice-transcription` skill must be applied first (WhatsApp channel) +- macOS with Apple Silicon (M1+) recommended +- `whisper-cpp` installed: `brew install whisper-cpp` (provides the `whisper-cli` binary) +- `ffmpeg` installed: `brew install ffmpeg` +- A GGML model file downloaded to `data/models/` + +## Phase 1: Pre-flight + +### Check if already applied + +Read `.nanoclaw/state.yaml`. If `use-local-whisper` is in `applied_skills`, skip to Phase 3 (Verify). + +### Check dependencies are installed + +```bash +whisper-cli --help >/dev/null 2>&1 && echo "WHISPER_OK" || echo "WHISPER_MISSING" +ffmpeg -version >/dev/null 2>&1 && echo "FFMPEG_OK" || echo "FFMPEG_MISSING" +``` + +If missing, install via Homebrew: +```bash +brew install whisper-cpp ffmpeg +``` + +### Check for model file + +```bash +ls data/models/ggml-*.bin 2>/dev/null || echo "NO_MODEL" +``` + +If no model exists, download the base model (148MB, good balance of speed and accuracy): +```bash +mkdir -p data/models +curl -L -o data/models/ggml-base.bin "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin" +``` + +For better accuracy at the cost of speed, use `ggml-small.bin` (466MB) or `ggml-medium.bin` (1.5GB). + +## Phase 2: Apply Code Changes + +```bash +npx tsx scripts/apply-skill.ts .claude/skills/use-local-whisper +``` + +This modifies `src/transcription.ts` to use the `whisper-cli` binary instead of the OpenAI API. + +### Validate + +```bash +npm test +npm run build +``` + +## Phase 3: Verify + +### Ensure launchd PATH includes Homebrew + +The NanoClaw launchd service runs with a restricted PATH. `whisper-cli` and `ffmpeg` are in `/opt/homebrew/bin/` (Apple Silicon) or `/usr/local/bin/` (Intel), which may not be in the plist's PATH. 
+ Check the current PATH: ```bash grep -A1 'PATH' ~/Library/LaunchAgents/com.nanoclaw.plist ``` If `/opt/homebrew/bin` is missing, add it to the `<string>` value inside the `PATH` key in the plist. Then reload: ```bash launchctl unload ~/Library/LaunchAgents/com.nanoclaw.plist launchctl load ~/Library/LaunchAgents/com.nanoclaw.plist ``` ### Build and restart ```bash npm run build launchctl kickstart -k gui/$(id -u)/com.nanoclaw ``` ### Test Send a voice note in any registered group. The agent should receive it as `[Voice: <transcript>]`. ### Check logs ```bash tail -f logs/nanoclaw.log | grep -i -E "voice|transcri|whisper" ``` Look for: - `Transcribed voice message` — successful transcription - `whisper.cpp transcription failed` — check model path, ffmpeg, or PATH ## Configuration Environment variables (optional, set in `.env`): | Variable | Default | Description | |----------|---------|-------------| | `WHISPER_BIN` | `whisper-cli` | Path to whisper.cpp binary | | `WHISPER_MODEL` | `data/models/ggml-base.bin` | Path to GGML model file | ## Troubleshooting **"whisper.cpp transcription failed"**: Ensure both `whisper-cli` and `ffmpeg` are in PATH. The launchd service uses a restricted PATH — see Phase 3 above. Test manually: ```bash ffmpeg -f lavfi -i anullsrc=r=16000:cl=mono -t 1 -f wav /tmp/test.wav -y whisper-cli -m data/models/ggml-base.bin -f /tmp/test.wav --no-timestamps -nt ``` **Transcription works in dev but not as service**: The launchd plist PATH likely doesn't include `/opt/homebrew/bin`. See "Ensure launchd PATH includes Homebrew" in Phase 3. **Slow transcription**: The base model processes ~30s of audio in <1s on M1+. If slower, check CPU usage — another process may be competing. **Wrong language**: whisper.cpp auto-detects language. To force a language, you can set `WHISPER_LANG` and modify `src/transcription.ts` to pass `-l $WHISPER_LANG`.
diff --git a/.claude/skills/use-local-whisper/manifest.yaml b/.claude/skills/use-local-whisper/manifest.yaml new file mode 100644 index 0000000..3ca356d --- /dev/null +++ b/.claude/skills/use-local-whisper/manifest.yaml @@ -0,0 +1,12 @@ +skill: use-local-whisper +version: 1.0.0 +description: "Switch voice transcription from OpenAI Whisper API to local whisper.cpp (WhatsApp only)" +core_version: 0.1.0 +adds: [] +modifies: + - src/transcription.ts +structured: {} +conflicts: [] +depends: + - voice-transcription +test: "npx vitest run src/channels/whatsapp.test.ts" diff --git a/.claude/skills/use-local-whisper/modify/src/transcription.ts b/.claude/skills/use-local-whisper/modify/src/transcription.ts new file mode 100644 index 0000000..45f39fc --- /dev/null +++ b/.claude/skills/use-local-whisper/modify/src/transcription.ts @@ -0,0 +1,95 @@ +import { execFile } from 'child_process'; +import fs from 'fs'; +import os from 'os'; +import path from 'path'; +import { promisify } from 'util'; + +import { downloadMediaMessage, WAMessage, WASocket } from '@whiskeysockets/baileys'; + +const execFileAsync = promisify(execFile); + +const WHISPER_BIN = process.env.WHISPER_BIN || 'whisper-cli'; +const WHISPER_MODEL = + process.env.WHISPER_MODEL || + path.join(process.cwd(), 'data', 'models', 'ggml-base.bin'); + +const FALLBACK_MESSAGE = '[Voice Message - transcription unavailable]'; + +async function transcribeWithWhisperCpp( + audioBuffer: Buffer, +): Promise<string | null> { + const tmpDir = os.tmpdir(); + const id = `nanoclaw-voice-${Date.now()}`; + const tmpOgg = path.join(tmpDir, `${id}.ogg`); + const tmpWav = path.join(tmpDir, `${id}.wav`); + + try { + fs.writeFileSync(tmpOgg, audioBuffer); + + // Convert ogg/opus to 16kHz mono WAV (required by whisper.cpp) + await execFileAsync('ffmpeg', [ + '-i', tmpOgg, + '-ar', '16000', + '-ac', '1', + '-f', 'wav', + '-y', tmpWav, + ], { timeout: 30_000 }); + + const { stdout } = await execFileAsync(WHISPER_BIN, [ + '-m', WHISPER_MODEL, + '-f', tmpWav, 
'--no-timestamps', + '-nt', + ], { timeout: 60_000 }); + + const transcript = stdout.trim(); + return transcript || null; + } catch (err) { + console.error('whisper.cpp transcription failed:', err); + return null; + } finally { + for (const f of [tmpOgg, tmpWav]) { + try { fs.unlinkSync(f); } catch { /* best effort cleanup */ } + } + } +} + +export async function transcribeAudioMessage( + msg: WAMessage, + sock: WASocket, +): Promise<string> { + try { + const buffer = (await downloadMediaMessage( + msg, + 'buffer', + {}, + { + logger: console as any, + reuploadRequest: sock.updateMediaMessage, + }, + )) as Buffer; + + if (!buffer || buffer.length === 0) { + console.error('Failed to download audio message'); + return FALLBACK_MESSAGE; + } + + console.log(`Downloaded audio message: ${buffer.length} bytes`); + + const transcript = await transcribeWithWhisperCpp(buffer); + + if (!transcript) { + return FALLBACK_MESSAGE; + } + + console.log(`Transcribed voice message: ${transcript.length} chars`); + return transcript.trim(); + } catch (err) { + console.error('Transcription error:', err); + return FALLBACK_MESSAGE; + } +} + +export function isVoiceMessage(msg: WAMessage): boolean { + return msg.message?.audioMessage?.ptt === true; +} diff --git a/.claude/skills/use-local-whisper/modify/src/transcription.ts.intent.md b/.claude/skills/use-local-whisper/modify/src/transcription.ts.intent.md new file mode 100644 index 0000000..47dabf1 --- /dev/null +++ b/.claude/skills/use-local-whisper/modify/src/transcription.ts.intent.md @@ -0,0 +1,39 @@ +# Intent: src/transcription.ts modifications + +## What changed +Replaced the OpenAI Whisper API backend with local whisper.cpp CLI execution. Audio is converted from ogg/opus to 16kHz mono WAV via ffmpeg, then transcribed locally using whisper-cpp. No API key or network required. 
## Key sections + +### Imports +- Removed: `readEnvFile` from `./env.js` (no API key needed) +- Added: `execFile` from `child_process`, `fs`, `os`, `path`, `promisify` from `util` + +### Configuration +- Removed: `TranscriptionConfig` interface and `DEFAULT_CONFIG` (no model/enabled/fallback config) +- Added: `WHISPER_BIN` constant (env `WHISPER_BIN` or `'whisper-cli'`) +- Added: `WHISPER_MODEL` constant (env `WHISPER_MODEL` or `data/models/ggml-base.bin`) +- Added: `FALLBACK_MESSAGE` constant + +### transcribeWithWhisperCpp (replaces transcribeWithOpenAI) +- Writes audio buffer to temp .ogg file +- Converts to 16kHz mono WAV via ffmpeg +- Runs whisper-cpp CLI with `--no-timestamps -nt` flags +- Cleans up temp files in finally block +- Returns trimmed stdout or null on error + +### transcribeAudioMessage +- Same signature: `(msg: WAMessage, sock: WASocket) => Promise<string>` +- Same download logic via `downloadMediaMessage` +- Calls `transcribeWithWhisperCpp` instead of `transcribeWithOpenAI` +- Same fallback behavior on error/null + +### isVoiceMessage +- Unchanged: `msg.message?.audioMessage?.ptt === true` + +## Invariants (must-keep) +- `transcribeAudioMessage` export signature unchanged +- `isVoiceMessage` export unchanged +- Fallback message strings unchanged: `[Voice Message - transcription unavailable]` +- downloadMediaMessage call pattern unchanged +- Error logging pattern unchanged diff --git a/.claude/skills/use-local-whisper/tests/use-local-whisper.test.ts b/.claude/skills/use-local-whisper/tests/use-local-whisper.test.ts new file mode 100644 index 0000000..580d44f --- /dev/null +++ b/.claude/skills/use-local-whisper/tests/use-local-whisper.test.ts @@ -0,0 +1,115 @@ +import { describe, expect, it } from 'vitest'; +import fs from 'fs'; +import path from 'path'; + +describe('use-local-whisper skill package', () => { + const skillDir = path.resolve(__dirname, '..'); + + it('has a valid manifest', () => { + const manifestPath = path.join(skillDir, 
'manifest.yaml'); + expect(fs.existsSync(manifestPath)).toBe(true); + + const content = fs.readFileSync(manifestPath, 'utf-8'); + expect(content).toContain('skill: use-local-whisper'); + expect(content).toContain('version: 1.0.0'); + expect(content).toContain('src/transcription.ts'); + expect(content).toContain('voice-transcription'); + }); + + it('declares voice-transcription as a dependency', () => { + const content = fs.readFileSync( + path.join(skillDir, 'manifest.yaml'), + 'utf-8', + ); + expect(content).toContain('depends:'); + expect(content).toContain('voice-transcription'); + }); + + it('has no structured operations (no new npm deps needed)', () => { + const content = fs.readFileSync( + path.join(skillDir, 'manifest.yaml'), + 'utf-8', + ); + expect(content).toContain('structured: {}'); + }); + + it('has the modified transcription file', () => { + const filePath = path.join(skillDir, 'modify', 'src', 'transcription.ts'); + expect(fs.existsSync(filePath)).toBe(true); + }); + + it('has an intent file for the modified file', () => { + const intentPath = path.join(skillDir, 'modify', 'src', 'transcription.ts.intent.md'); + expect(fs.existsSync(intentPath)).toBe(true); + + const content = fs.readFileSync(intentPath, 'utf-8'); + expect(content).toContain('whisper.cpp'); + expect(content).toContain('transcribeAudioMessage'); + expect(content).toContain('isVoiceMessage'); + expect(content).toContain('Invariants'); + }); + + it('uses whisper-cli (not OpenAI) for transcription', () => { + const content = fs.readFileSync( + path.join(skillDir, 'modify', 'src', 'transcription.ts'), + 'utf-8', + ); + + // Uses local whisper.cpp CLI + expect(content).toContain('whisper-cli'); + expect(content).toContain('execFileAsync'); + expect(content).toContain('WHISPER_BIN'); + expect(content).toContain('WHISPER_MODEL'); + expect(content).toContain('ggml-base.bin'); + + // Does NOT use OpenAI + expect(content).not.toContain('openai'); + expect(content).not.toContain('OpenAI'); + 
expect(content).not.toContain('OPENAI_API_KEY'); + expect(content).not.toContain('readEnvFile'); + }); + + it('preserves the public API (transcribeAudioMessage and isVoiceMessage)', () => { + const content = fs.readFileSync( + path.join(skillDir, 'modify', 'src', 'transcription.ts'), + 'utf-8', + ); + + expect(content).toContain('export async function transcribeAudioMessage('); + expect(content).toContain('msg: WAMessage'); + expect(content).toContain('sock: WASocket'); + expect(content).toContain('Promise<string>'); + expect(content).toContain('export function isVoiceMessage('); + expect(content).toContain('downloadMediaMessage'); + }); + + it('preserves fallback message strings', () => { + const content = fs.readFileSync( + path.join(skillDir, 'modify', 'src', 'transcription.ts'), + 'utf-8', + ); + + expect(content).toContain('[Voice Message - transcription unavailable]'); + }); + + it('includes ffmpeg conversion step', () => { + const content = fs.readFileSync( + path.join(skillDir, 'modify', 'src', 'transcription.ts'), + 'utf-8', + ); + + expect(content).toContain('ffmpeg'); + expect(content).toContain("'-ar', '16000'"); + expect(content).toContain("'-ac', '1'"); + }); + + it('cleans up temp files in finally block', () => { + const content = fs.readFileSync( + path.join(skillDir, 'modify', 'src', 'transcription.ts'), + 'utf-8', + ); + + expect(content).toContain('finally'); + expect(content).toContain('unlinkSync'); + }); +});