feat(skills): add use-local-whisper skill package (#702)

Thanks for the great contribution @glifocat! This is a really well-structured skill — clean package, thorough docs, and solid test coverage. Hope to see more skills like this from you!
2026-03-04 14:56:31 +01:00
parent 5e3d8b6c2c
commit 03f792bfce
7 changed files with 394 additions and 0 deletions
--- a/.claude/skills/use-local-whisper/modify/src/transcription.ts
+++ b/.claude/skills/use-local-whisper/modify/src/transcription.ts
@@ -0,0 +1,95 @@
+import { execFile } from 'child_process';
+import fs from 'fs';
+import os from 'os';
+import path from 'path';
+import { promisify } from 'util';
+
+import { downloadMediaMessage, WAMessage, WASocket } from '@whiskeysockets/baileys';
+
+/** Promise-returning form of `execFile`, so external tools can be awaited. */
+const execFileAsync = promisify(execFile);
+
+/** whisper.cpp executable to run; the `WHISPER_BIN` environment variable overrides the default. */
+const WHISPER_BIN = process.env.WHISPER_BIN || 'whisper-cli';
+
+/**
+ * Model file passed to whisper.cpp via `-m`. Taken from the `WHISPER_MODEL`
+ * environment variable when it is set and non-empty, otherwise resolved
+ * relative to the current working directory.
+ */
+const WHISPER_MODEL = process.env.WHISPER_MODEL
+  ? process.env.WHISPER_MODEL
+  : path.join(process.cwd(), 'data', 'models', 'ggml-base.bin');
+
+/** Text handed back to callers whenever no transcript could be produced. */
+const FALLBACK_MESSAGE = '[Voice Message - transcription unavailable]';
+
+/**
+ * Transcribes an audio buffer locally with the whisper.cpp CLI.
+ *
+ * The buffer is written to a temporary `.ogg` file, converted by ffmpeg to a
+ * 16kHz mono WAV, and passed to `WHISPER_BIN` together with `WHISPER_MODEL`.
+ * Both files live in a private directory created with `mkdtempSync`, so
+ * transcriptions that start in the same millisecond cannot overwrite or
+ * delete each other's files.
+ *
+ * @param audioBuffer - Audio bytes to transcribe (expected to be ogg/opus).
+ * @returns The trimmed stdout of the whisper.cpp run, or `null` when that
+ *   output is empty or any step fails (the failure is logged, never thrown).
+ */
+async function transcribeWithWhisperCpp(
+  audioBuffer: Buffer,
+): Promise<string | null> {
+  // Assigned inside the try block so a failure to create the directory is
+  // logged and turned into `null` like every other error.
+  let workDir: string | undefined;
+
+  try {
+    // mkdtempSync appends random characters, giving each call its own directory.
+    workDir = fs.mkdtempSync(path.join(os.tmpdir(), 'nanoclaw-voice-'));
+    const tmpOgg = path.join(workDir, 'input.ogg');
+    const tmpWav = path.join(workDir, 'input.wav');
+
+    fs.writeFileSync(tmpOgg, audioBuffer);
+
+    // Convert ogg/opus to 16kHz mono WAV (required by whisper.cpp)
+    await execFileAsync('ffmpeg', [
+      '-i', tmpOgg,
+      '-ar', '16000',
+      '-ac', '1',
+      '-f', 'wav',
+      '-y', tmpWav,
+    ], { timeout: 30_000 });
+
+    const { stdout } = await execFileAsync(WHISPER_BIN, [
+      '-m', WHISPER_MODEL,
+      '-f', tmpWav,
+      '--no-timestamps',
+      '-nt',
+    ], { timeout: 60_000 });
+
+    const transcript = stdout.trim();
+    return transcript || null;
+  } catch (err) {
+    console.error('whisper.cpp transcription failed:', err);
+    return null;
+  } finally {
+    if (workDir !== undefined) {
+      try {
+        // Removes the directory together with both temporary files.
+        fs.rmSync(workDir, { recursive: true, force: true });
+      } catch { /* best effort cleanup */ }
+    }
+  }
+}
+
+/**
+ * Downloads the audio attached to a message and transcribes it locally.
+ *
+ * Never rejects: an empty or missing download, a missing transcript, and any
+ * thrown error all resolve to `FALLBACK_MESSAGE`.
+ *
+ * @param msg - Message whose media is downloaded.
+ * @param sock - Socket whose `updateMediaMessage` is passed as the re-upload handler.
+ * @returns The trimmed transcript, or `FALLBACK_MESSAGE` when none could be produced.
+ */
+export async function transcribeAudioMessage(
+  msg: WAMessage,
+  sock: WASocket,
+): Promise<string | null> {
+  try {
+    const downloadContext = {
+      logger: console as any,
+      reuploadRequest: sock.updateMediaMessage,
+    };
+    const audio = (await downloadMediaMessage(
+      msg,
+      'buffer',
+      {},
+      downloadContext,
+    )) as Buffer;
+
+    // Covers both a missing buffer and a zero-length one.
+    if (!audio?.length) {
+      console.error('Failed to download audio message');
+      return FALLBACK_MESSAGE;
+    }
+    console.log(`Downloaded audio message: ${audio.length} bytes`);
+
+    const text = await transcribeWithWhisperCpp(audio);
+    if (text) {
+      console.log(`Transcribed voice message: ${text.length} chars`);
+      return text.trim();
+    }
+
+    return FALLBACK_MESSAGE;
+  } catch (err) {
+    console.error('Transcription error:', err);
+    return FALLBACK_MESSAGE;
+  }
+}
+
+/**
+ * Reports whether a message is a voice message.
+ *
+ * @param msg - Message to inspect.
+ * @returns `true` only when the message carries an `audioMessage` whose `ptt`
+ *   flag is exactly `true`; `false` in every other case.
+ */
+export function isVoiceMessage(msg: WAMessage): boolean {
+  const audio = msg.message?.audioMessage;
+  return audio?.ptt === true;
+}
--- a/.claude/skills/use-local-whisper/modify/src/transcription.ts.intent.md
+++ b/.claude/skills/use-local-whisper/modify/src/transcription.ts.intent.md
@@ -0,0 +1,39 @@
+# Intent: src/transcription.ts modifications
+
+## What changed
+Replaced the OpenAI Whisper API backend with local whisper.cpp CLI execution. Audio is converted from ogg/opus to 16kHz mono WAV via ffmpeg, then transcribed locally using whisper-cpp. No API key or network required.
+
+## Key sections
+
+### Imports
+- Removed: `readEnvFile` from `./env.js` (no API key needed)
+- Added: `execFile` from `child_process`, `fs`, `os`, `path`, `promisify` from `util`
+
+### Configuration
+- Removed: `TranscriptionConfig` interface and `DEFAULT_CONFIG` (no model/enabled/fallback config)
+- Added: `WHISPER_BIN` constant (env `WHISPER_BIN` or `'whisper-cli'`)
+- Added: `WHISPER_MODEL` constant (env `WHISPER_MODEL` or `data/models/ggml-base.bin`)
+- Added: `FALLBACK_MESSAGE` constant
+
+### transcribeWithWhisperCpp (replaces transcribeWithOpenAI)
+- Writes audio buffer to temp .ogg file
+- Converts to 16kHz mono WAV via ffmpeg
+- Runs whisper-cpp CLI with `--no-timestamps -nt` flags
+- Cleans up temp files in finally block
+- Returns trimmed stdout, or null on error or when the output is empty
+
+### transcribeAudioMessage
+- Same signature: `(msg: WAMessage, sock: WASocket) => Promise<string | null>`
+- Same download logic via `downloadMediaMessage`
+- Calls `transcribeWithWhisperCpp` instead of `transcribeWithOpenAI`
+- Same fallback behavior on error/null
+
+### isVoiceMessage
+- Unchanged: `msg.message?.audioMessage?.ptt === true`
+
+## Invariants (must-keep)
+- `transcribeAudioMessage` export signature unchanged
+- `isVoiceMessage` export unchanged
+- Fallback message strings unchanged: `[Voice Message - transcription unavailable]`
+- downloadMediaMessage call pattern unchanged
+- Error logging pattern unchanged