feat(skills): add use-local-whisper skill package (#702)

Thanks for the great contribution @glifocat! This is a really well-structured skill — clean package, thorough docs, and solid test coverage. Hope to see more skills like this from you!
This commit is contained in:
glifocat
2026-03-04 14:56:31 +01:00
committed by GitHub
parent 5e3d8b6c2c
commit 03f792bfce
7 changed files with 394 additions and 0 deletions

View File

@@ -0,0 +1,95 @@
import { execFile } from 'child_process';
import fs from 'fs';
import os from 'os';
import path from 'path';
import { promisify } from 'util';
import { downloadMediaMessage, WAMessage, WASocket } from '@whiskeysockets/baileys';
/** Placeholder text handed back whenever a voice message cannot be transcribed. */
const FALLBACK_MESSAGE = '[Voice Message - transcription unavailable]';

/** Promise-returning variant of `child_process.execFile`. */
const execFileAsync = promisify(execFile);

/** whisper.cpp executable name or path; the `WHISPER_BIN` env var overrides the default. */
const WHISPER_BIN = process.env.WHISPER_BIN || 'whisper-cli';

/**
 * Model file passed to the whisper binary. The `WHISPER_MODEL` env var wins;
 * otherwise `data/models/ggml-base.bin` under the current working directory.
 */
const WHISPER_MODEL =
  process.env.WHISPER_MODEL ||
  // Relative segments are resolved against process.cwd().
  path.resolve('data', 'models', 'ggml-base.bin');
/**
 * Transcribes an audio buffer locally by converting it with ffmpeg and
 * running the whisper binary on the result.
 *
 * Each call works inside its own freshly created temporary directory, so
 * concurrent transcriptions can never read, overwrite, or delete each other's
 * intermediate files (a timestamp-only file name is not unique when two voice
 * messages arrive within the same millisecond).
 *
 * @param audioBuffer Raw audio bytes; written to disk as an `.ogg` file
 *   before conversion.
 * @returns The trimmed stdout of the whisper binary, or `null` when the
 *   output is empty or any step (temp dir creation, ffmpeg, whisper) fails.
 *   Failures are logged and never thrown.
 */
async function transcribeWithWhisperCpp(
  audioBuffer: Buffer,
): Promise<string | null> {
  let workDir: string | undefined;
  try {
    // mkdtemp appends random characters, giving a unique private directory.
    workDir = await fs.promises.mkdtemp(
      path.join(os.tmpdir(), 'nanoclaw-voice-'),
    );
    const tmpOgg = path.join(workDir, 'input.ogg');
    const tmpWav = path.join(workDir, 'input.wav');

    await fs.promises.writeFile(tmpOgg, audioBuffer);

    // Convert ogg/opus to 16kHz mono WAV (required by whisper.cpp)
    await execFileAsync('ffmpeg', [
      '-i', tmpOgg,
      '-ar', '16000',
      '-ac', '1',
      '-f', 'wav',
      '-y', tmpWav,
    ], { timeout: 30_000 });

    // NOTE(review): '-nt' looks like the short form of '--no-timestamps' in
    // whisper.cpp, so one of the two is probably redundant — confirm before
    // removing either.
    const { stdout } = await execFileAsync(WHISPER_BIN, [
      '-m', WHISPER_MODEL,
      '-f', tmpWav,
      '--no-timestamps',
      '-nt',
    ], { timeout: 60_000 });

    const transcript = stdout.trim();
    return transcript || null;
  } catch (err) {
    console.error('whisper.cpp transcription failed:', err);
    return null;
  } finally {
    if (workDir !== undefined) {
      try {
        // Removes the directory and both intermediate files in one step.
        await fs.promises.rm(workDir, { recursive: true, force: true });
      } catch {
        /* best effort cleanup */
      }
    }
  }
}
/**
 * Downloads the media attached to a message and returns its local
 * transcription.
 *
 * @param msg Message whose media is downloaded as a buffer.
 * @param sock Socket whose `updateMediaMessage` is supplied as the
 *   re-upload callback for the download.
 * @returns The trimmed transcript, or the fallback placeholder text when the
 *   download yields no data, transcription produces nothing, or anything
 *   throws. Errors are logged rather than propagated.
 */
export async function transcribeAudioMessage(
  msg: WAMessage,
  sock: WASocket,
): Promise<string | null> {
  try {
    const audio = (await downloadMediaMessage(msg, 'buffer', {}, {
      logger: console as any,
      reuploadRequest: sock.updateMediaMessage,
    })) as Buffer;

    const hasAudio = Boolean(audio) && audio.length !== 0;
    if (!hasAudio) {
      console.error('Failed to download audio message');
      return FALLBACK_MESSAGE;
    }
    console.log(`Downloaded audio message: ${audio.length} bytes`);

    const text = await transcribeWithWhisperCpp(audio);
    if (text) {
      console.log(`Transcribed voice message: ${text.length} chars`);
      return text.trim();
    }
    // Transcription failed or produced no text.
    return FALLBACK_MESSAGE;
  } catch (error) {
    console.error('Transcription error:', error);
    return FALLBACK_MESSAGE;
  }
}
/**
 * Reports whether a message's `audioMessage.ptt` flag is strictly `true`.
 *
 * @param msg Message to inspect; a missing `message` or `audioMessage`
 *   yields `false`.
 * @returns `true` only when `ptt` is exactly `true`.
 */
export function isVoiceMessage(msg: WAMessage): boolean {
  const pttFlag = msg.message?.audioMessage?.ptt;
  return pttFlag === true;
}

View File

@@ -0,0 +1,39 @@
# Intent: src/transcription.ts modifications
## What changed
Replaced the OpenAI Whisper API backend with local whisper.cpp CLI execution. Audio is converted from ogg/opus to 16kHz mono WAV via ffmpeg, then transcribed locally with whisper.cpp. No API key or network access is required.
## Key sections
### Imports
- Removed: `readEnvFile` from `./env.js` (no API key needed)
- Added: `execFile` from `child_process`, `fs`, `os`, `path`, `promisify` from `util`
### Configuration
- Removed: `TranscriptionConfig` interface and `DEFAULT_CONFIG` (no model/enabled/fallback config)
- Added: `WHISPER_BIN` constant (env `WHISPER_BIN` or `'whisper-cli'`)
- Added: `WHISPER_MODEL` constant (env `WHISPER_MODEL` or `data/models/ggml-base.bin`)
- Added: `FALLBACK_MESSAGE` constant
### transcribeWithWhisperCpp (replaces transcribeWithOpenAI)
- Writes audio buffer to temp .ogg file
- Converts to 16kHz mono WAV via ffmpeg
- Runs the whisper.cpp CLI (`whisper-cli` by default) with `--no-timestamps -nt` flags
- Cleans up temp files in finally block
- Returns trimmed stdout, or null on error or when the output is empty
### transcribeAudioMessage
- Same signature: `(msg: WAMessage, sock: WASocket) => Promise<string | null>`
- Same download logic via `downloadMediaMessage`
- Calls `transcribeWithWhisperCpp` instead of `transcribeWithOpenAI`
- Same fallback behavior on error/null
### isVoiceMessage
- Unchanged: `msg.message?.audioMessage?.ptt === true`
## Invariants (must-keep)
- `transcribeAudioMessage` export signature unchanged
- `isVoiceMessage` export unchanged
- Fallback message string unchanged: `[Voice Message - transcription unavailable]`
- downloadMediaMessage call pattern unchanged
- Error logging pattern unchanged