feat(skills): add use-local-whisper skill package (#702)
Thanks for the great contribution @glifocat! This is a really well-structured skill — clean package, thorough docs, and solid test coverage. Hope to see more skills like this from you!
.claude/skills/use-local-whisper/SKILL.md (new file, 128 lines)
@@ -0,0 +1,128 @@
---
name: use-local-whisper
description: Use when the user wants local voice transcription instead of the OpenAI Whisper API. Switches to whisper.cpp running on Apple Silicon. WhatsApp only for now. Requires the voice-transcription skill to be applied first.
---

# Use Local Whisper

Switches voice transcription from OpenAI's Whisper API to local whisper.cpp. Runs entirely on-device: no API key, no network, no cost.

**Channel support:** Currently WhatsApp only. The transcription module (`src/transcription.ts`) uses Baileys types for audio download. Other channels (Telegram, Discord, etc.) would need their own audio-download logic before this skill can serve them.

**Note:** The Homebrew package is `whisper-cpp`, but the CLI binary it installs is `whisper-cli`.

## Prerequisites

- `voice-transcription` skill must be applied first (WhatsApp channel)
- macOS with Apple Silicon (M1+) recommended
- `whisper-cpp` installed: `brew install whisper-cpp` (provides the `whisper-cli` binary)
- `ffmpeg` installed: `brew install ffmpeg`
- A GGML model file downloaded to `data/models/`

## Phase 1: Pre-flight

### Check if already applied

Read `.nanoclaw/state.yaml`. If `use-local-whisper` is in `applied_skills`, skip to Phase 3 (Verify).

### Check dependencies are installed

```bash
whisper-cli --help >/dev/null 2>&1 && echo "WHISPER_OK" || echo "WHISPER_MISSING"
ffmpeg -version >/dev/null 2>&1 && echo "FFMPEG_OK" || echo "FFMPEG_MISSING"
```

If either is missing, install via Homebrew:

```bash
brew install whisper-cpp ffmpeg
```

### Check for model file

```bash
ls data/models/ggml-*.bin 2>/dev/null || echo "NO_MODEL"
```

If no model exists, download the base model (148 MB, a good balance of speed and accuracy):

```bash
mkdir -p data/models
curl -L -o data/models/ggml-base.bin "https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-base.bin"
```

For better accuracy at the cost of speed, use `ggml-small.bin` (466 MB) or `ggml-medium.bin` (1.5 GB).

## Phase 2: Apply Code Changes

```bash
npx tsx scripts/apply-skill.ts .claude/skills/use-local-whisper
```

This modifies `src/transcription.ts` to use the `whisper-cli` binary instead of the OpenAI API.

### Validate

```bash
npm test
npm run build
```

## Phase 3: Verify

### Ensure launchd PATH includes Homebrew

The NanoClaw launchd service runs with a restricted PATH. `whisper-cli` and `ffmpeg` live in `/opt/homebrew/bin/` (Apple Silicon) or `/usr/local/bin/` (Intel), which may not be in the plist's PATH.

Check the current PATH:

```bash
grep -A1 'PATH' ~/Library/LaunchAgents/com.nanoclaw.plist
```
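
For reference, the relevant section of a launchd plist with an environment PATH typically looks like this (the `EnvironmentVariables` layout is the standard launchd convention; the exact keys in `com.nanoclaw.plist` may differ):

```xml
<key>EnvironmentVariables</key>
<dict>
    <key>PATH</key>
    <string>/opt/homebrew/bin:/usr/local/bin:/usr/bin:/bin</string>
</dict>
```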

If `/opt/homebrew/bin` is missing, add it to the `<string>` value inside the `PATH` key in the plist. Then reload:

```bash
launchctl unload ~/Library/LaunchAgents/com.nanoclaw.plist
launchctl load ~/Library/LaunchAgents/com.nanoclaw.plist
```

### Build and restart

```bash
npm run build
launchctl kickstart -k gui/$(id -u)/com.nanoclaw
```

### Test

Send a voice note in any registered group. The agent should receive it as `[Voice: <transcript>]`.

### Check logs

```bash
tail -f logs/nanoclaw.log | grep -i -E "voice|transcri|whisper"
```

Look for:

- `Transcribed voice message`: successful transcription
- `whisper.cpp transcription failed`: check model path, ffmpeg, or PATH

## Configuration

Environment variables (optional, set in `.env`):

| Variable | Default | Description |
|----------|---------|-------------|
| `WHISPER_BIN` | `whisper-cli` | Path to whisper.cpp binary |
| `WHISPER_MODEL` | `data/models/ggml-base.bin` | Path to GGML model file |
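
For example, to point the service at a specific binary and the small model, `.env` might contain (paths illustrative):

```shell
WHISPER_BIN=/opt/homebrew/bin/whisper-cli
WHISPER_MODEL=data/models/ggml-small.bin
```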

## Troubleshooting

**"whisper.cpp transcription failed"**: Ensure both `whisper-cli` and `ffmpeg` are in PATH. The launchd service uses a restricted PATH; see Phase 3 above. Test manually:

```bash
ffmpeg -f lavfi -i anullsrc=r=16000:cl=mono -t 1 -f wav -y /tmp/test.wav
whisper-cli -m data/models/ggml-base.bin -f /tmp/test.wav --no-timestamps -nt
```

**Transcription works in dev but not as a service**: The launchd plist PATH likely doesn't include `/opt/homebrew/bin`. See "Ensure launchd PATH includes Homebrew" in Phase 3.

**Slow transcription**: The base model processes ~30 s of audio in under a second on M1+. If it is slower, check CPU usage; another process may be competing.

**Wrong language**: whisper.cpp auto-detects the language. To force one, you can set `WHISPER_LANG` and modify `src/transcription.ts` to pass `-l $WHISPER_LANG`.
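
A minimal sketch of that modification, assuming a hypothetical `WHISPER_LANG` environment variable (the shipped `src/transcription.ts` does not read it):

```typescript
// Hypothetical helper: builds the whisper-cli argument list, optionally
// forcing a language instead of relying on whisper.cpp auto-detection.
function buildWhisperArgs(model: string, wavPath: string, lang?: string): string[] {
  const args = ['-m', model, '-f', wavPath, '--no-timestamps'];
  if (lang) {
    args.push('-l', lang); // e.g. 'en', 'de'
  }
  return args;
}

// In transcribeWithWhisperCpp, the execFileAsync call would then become:
// await execFileAsync(WHISPER_BIN,
//   buildWhisperArgs(WHISPER_MODEL, tmpWav, process.env.WHISPER_LANG),
//   { timeout: 60_000 });
console.log(buildWhisperArgs('data/models/ggml-base.bin', '/tmp/test.wav', 'en'));
```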
.claude/skills/use-local-whisper/manifest.yaml (new file, 12 lines)
@@ -0,0 +1,12 @@
skill: use-local-whisper
version: 1.0.0
description: "Switch voice transcription from OpenAI Whisper API to local whisper.cpp (WhatsApp only)"
core_version: 0.1.0
adds: []
modifies:
  - src/transcription.ts
structured: {}
conflicts: []
depends:
  - voice-transcription
test: "npx vitest run src/channels/whatsapp.test.ts"
.claude/skills/use-local-whisper/modify/src/transcription.ts (new file, 95 lines)
@@ -0,0 +1,95 @@
import { execFile } from 'child_process';
import fs from 'fs';
import os from 'os';
import path from 'path';
import { promisify } from 'util';

import { downloadMediaMessage, WAMessage, WASocket } from '@whiskeysockets/baileys';

const execFileAsync = promisify(execFile);

const WHISPER_BIN = process.env.WHISPER_BIN || 'whisper-cli';
const WHISPER_MODEL =
  process.env.WHISPER_MODEL ||
  path.join(process.cwd(), 'data', 'models', 'ggml-base.bin');

const FALLBACK_MESSAGE = '[Voice Message - transcription unavailable]';

async function transcribeWithWhisperCpp(
  audioBuffer: Buffer,
): Promise<string | null> {
  const tmpDir = os.tmpdir();
  const id = `nanoclaw-voice-${Date.now()}`;
  const tmpOgg = path.join(tmpDir, `${id}.ogg`);
  const tmpWav = path.join(tmpDir, `${id}.wav`);

  try {
    fs.writeFileSync(tmpOgg, audioBuffer);

    // Convert ogg/opus to 16kHz mono WAV (required by whisper.cpp)
    await execFileAsync('ffmpeg', [
      '-i', tmpOgg,
      '-ar', '16000',
      '-ac', '1',
      '-f', 'wav',
      '-y', tmpWav,
    ], { timeout: 30_000 });

    const { stdout } = await execFileAsync(WHISPER_BIN, [
      '-m', WHISPER_MODEL,
      '-f', tmpWav,
      '--no-timestamps',
      '-nt',
    ], { timeout: 60_000 });

    const transcript = stdout.trim();
    return transcript || null;
  } catch (err) {
    console.error('whisper.cpp transcription failed:', err);
    return null;
  } finally {
    for (const f of [tmpOgg, tmpWav]) {
      try { fs.unlinkSync(f); } catch { /* best effort cleanup */ }
    }
  }
}

export async function transcribeAudioMessage(
  msg: WAMessage,
  sock: WASocket,
): Promise<string | null> {
  try {
    const buffer = (await downloadMediaMessage(
      msg,
      'buffer',
      {},
      {
        logger: console as any,
        reuploadRequest: sock.updateMediaMessage,
      },
    )) as Buffer;

    if (!buffer || buffer.length === 0) {
      console.error('Failed to download audio message');
      return FALLBACK_MESSAGE;
    }

    console.log(`Downloaded audio message: ${buffer.length} bytes`);

    const transcript = await transcribeWithWhisperCpp(buffer);

    if (!transcript) {
      return FALLBACK_MESSAGE;
    }

    console.log(`Transcribed voice message: ${transcript.length} chars`);
    return transcript.trim();
  } catch (err) {
    console.error('Transcription error:', err);
    return FALLBACK_MESSAGE;
  }
}

export function isVoiceMessage(msg: WAMessage): boolean {
  return msg.message?.audioMessage?.ptt === true;
}
.claude/skills/use-local-whisper/modify/src/transcription.ts.intent.md (new file, 39 lines)
@@ -0,0 +1,39 @@
# Intent: src/transcription.ts modifications

## What changed

Replaced the OpenAI Whisper API backend with local whisper.cpp CLI execution. Audio is converted from ogg/opus to 16kHz mono WAV via ffmpeg, then transcribed locally using whisper-cpp. No API key or network required.

## Key sections

### Imports

- Removed: `readEnvFile` from `./env.js` (no API key needed)
- Added: `execFile` from `child_process`, `fs`, `os`, `path`, `promisify` from `util`

### Configuration

- Removed: `TranscriptionConfig` interface and `DEFAULT_CONFIG` (no model/enabled/fallback config)
- Added: `WHISPER_BIN` constant (env `WHISPER_BIN` or `'whisper-cli'`)
- Added: `WHISPER_MODEL` constant (env `WHISPER_MODEL` or `data/models/ggml-base.bin`)
- Added: `FALLBACK_MESSAGE` constant

### transcribeWithWhisperCpp (replaces transcribeWithOpenAI)

- Writes audio buffer to temp .ogg file
- Converts to 16kHz mono WAV via ffmpeg
- Runs whisper-cpp CLI with `--no-timestamps -nt` flags
- Cleans up temp files in finally block
- Returns trimmed stdout or null on error

### transcribeAudioMessage

- Same signature: `(msg: WAMessage, sock: WASocket) => Promise<string | null>`
- Same download logic via `downloadMediaMessage`
- Calls `transcribeWithWhisperCpp` instead of `transcribeWithOpenAI`
- Same fallback behavior on error/null

### isVoiceMessage

- Unchanged: `msg.message?.audioMessage?.ptt === true`

## Invariants (must-keep)

- `transcribeAudioMessage` export signature unchanged
- `isVoiceMessage` export unchanged
- Fallback message string unchanged: `[Voice Message - transcription unavailable]`
- downloadMediaMessage call pattern unchanged
- Error logging pattern unchanged
.claude/skills/use-local-whisper/tests/use-local-whisper.test.ts (new file, 115 lines)
@@ -0,0 +1,115 @@
import { describe, expect, it } from 'vitest';
import fs from 'fs';
import path from 'path';

describe('use-local-whisper skill package', () => {
  const skillDir = path.resolve(__dirname, '..');

  it('has a valid manifest', () => {
    const manifestPath = path.join(skillDir, 'manifest.yaml');
    expect(fs.existsSync(manifestPath)).toBe(true);

    const content = fs.readFileSync(manifestPath, 'utf-8');
    expect(content).toContain('skill: use-local-whisper');
    expect(content).toContain('version: 1.0.0');
    expect(content).toContain('src/transcription.ts');
    expect(content).toContain('voice-transcription');
  });

  it('declares voice-transcription as a dependency', () => {
    const content = fs.readFileSync(
      path.join(skillDir, 'manifest.yaml'),
      'utf-8',
    );
    expect(content).toContain('depends:');
    expect(content).toContain('voice-transcription');
  });

  it('has no structured operations (no new npm deps needed)', () => {
    const content = fs.readFileSync(
      path.join(skillDir, 'manifest.yaml'),
      'utf-8',
    );
    expect(content).toContain('structured: {}');
  });

  it('has the modified transcription file', () => {
    const filePath = path.join(skillDir, 'modify', 'src', 'transcription.ts');
    expect(fs.existsSync(filePath)).toBe(true);
  });

  it('has an intent file for the modified file', () => {
    const intentPath = path.join(skillDir, 'modify', 'src', 'transcription.ts.intent.md');
    expect(fs.existsSync(intentPath)).toBe(true);

    const content = fs.readFileSync(intentPath, 'utf-8');
    expect(content).toContain('whisper.cpp');
    expect(content).toContain('transcribeAudioMessage');
    expect(content).toContain('isVoiceMessage');
    expect(content).toContain('Invariants');
  });

  it('uses whisper-cli (not OpenAI) for transcription', () => {
    const content = fs.readFileSync(
      path.join(skillDir, 'modify', 'src', 'transcription.ts'),
      'utf-8',
    );

    // Uses local whisper.cpp CLI
    expect(content).toContain('whisper-cli');
    expect(content).toContain('execFileAsync');
    expect(content).toContain('WHISPER_BIN');
    expect(content).toContain('WHISPER_MODEL');
    expect(content).toContain('ggml-base.bin');

    // Does NOT use OpenAI
    expect(content).not.toContain('openai');
    expect(content).not.toContain('OpenAI');
    expect(content).not.toContain('OPENAI_API_KEY');
    expect(content).not.toContain('readEnvFile');
  });

  it('preserves the public API (transcribeAudioMessage and isVoiceMessage)', () => {
    const content = fs.readFileSync(
      path.join(skillDir, 'modify', 'src', 'transcription.ts'),
      'utf-8',
    );

    expect(content).toContain('export async function transcribeAudioMessage(');
    expect(content).toContain('msg: WAMessage');
    expect(content).toContain('sock: WASocket');
    expect(content).toContain('Promise<string | null>');
    expect(content).toContain('export function isVoiceMessage(');
    expect(content).toContain('downloadMediaMessage');
  });

  it('preserves fallback message strings', () => {
    const content = fs.readFileSync(
      path.join(skillDir, 'modify', 'src', 'transcription.ts'),
      'utf-8',
    );

    expect(content).toContain('[Voice Message - transcription unavailable]');
  });

  it('includes ffmpeg conversion step', () => {
    const content = fs.readFileSync(
      path.join(skillDir, 'modify', 'src', 'transcription.ts'),
      'utf-8',
    );

    expect(content).toContain('ffmpeg');
    expect(content).toContain("'-ar', '16000'");
    expect(content).toContain("'-ac', '1'");
  });

  it('cleans up temp files in finally block', () => {
    const content = fs.readFileSync(
      path.join(skillDir, 'modify', 'src', 'transcription.ts'),
      'utf-8',
    );

    expect(content).toContain('finally');
    expect(content).toContain('unlinkSync');
  });
});