feat(skills): add use-local-whisper skill package (#702)
Thanks for the great contribution @glifocat! This is a really well-structured skill — clean package, thorough docs, and solid test coverage. Hope to see more skills like this from you!
This commit is contained in:
95
.claude/skills/use-local-whisper/modify/src/transcription.ts
Normal file
95
.claude/skills/use-local-whisper/modify/src/transcription.ts
Normal file
@@ -0,0 +1,95 @@
|
||||
import { execFile } from 'child_process';
|
||||
import fs from 'fs';
|
||||
import os from 'os';
|
||||
import path from 'path';
|
||||
import { promisify } from 'util';
|
||||
|
||||
import { downloadMediaMessage, WAMessage, WASocket } from '@whiskeysockets/baileys';
|
||||
|
||||
const execFileAsync = promisify(execFile);
|
||||
|
||||
const WHISPER_BIN = process.env.WHISPER_BIN || 'whisper-cli';
|
||||
const WHISPER_MODEL =
|
||||
process.env.WHISPER_MODEL ||
|
||||
path.join(process.cwd(), 'data', 'models', 'ggml-base.bin');
|
||||
|
||||
const FALLBACK_MESSAGE = '[Voice Message - transcription unavailable]';
|
||||
|
||||
async function transcribeWithWhisperCpp(
|
||||
audioBuffer: Buffer,
|
||||
): Promise<string | null> {
|
||||
const tmpDir = os.tmpdir();
|
||||
const id = `nanoclaw-voice-${Date.now()}`;
|
||||
const tmpOgg = path.join(tmpDir, `${id}.ogg`);
|
||||
const tmpWav = path.join(tmpDir, `${id}.wav`);
|
||||
|
||||
try {
|
||||
fs.writeFileSync(tmpOgg, audioBuffer);
|
||||
|
||||
// Convert ogg/opus to 16kHz mono WAV (required by whisper.cpp)
|
||||
await execFileAsync('ffmpeg', [
|
||||
'-i', tmpOgg,
|
||||
'-ar', '16000',
|
||||
'-ac', '1',
|
||||
'-f', 'wav',
|
||||
'-y', tmpWav,
|
||||
], { timeout: 30_000 });
|
||||
|
||||
const { stdout } = await execFileAsync(WHISPER_BIN, [
|
||||
'-m', WHISPER_MODEL,
|
||||
'-f', tmpWav,
|
||||
'--no-timestamps',
|
||||
'-nt',
|
||||
], { timeout: 60_000 });
|
||||
|
||||
const transcript = stdout.trim();
|
||||
return transcript || null;
|
||||
} catch (err) {
|
||||
console.error('whisper.cpp transcription failed:', err);
|
||||
return null;
|
||||
} finally {
|
||||
for (const f of [tmpOgg, tmpWav]) {
|
||||
try { fs.unlinkSync(f); } catch { /* best effort cleanup */ }
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
export async function transcribeAudioMessage(
|
||||
msg: WAMessage,
|
||||
sock: WASocket,
|
||||
): Promise<string | null> {
|
||||
try {
|
||||
const buffer = (await downloadMediaMessage(
|
||||
msg,
|
||||
'buffer',
|
||||
{},
|
||||
{
|
||||
logger: console as any,
|
||||
reuploadRequest: sock.updateMediaMessage,
|
||||
},
|
||||
)) as Buffer;
|
||||
|
||||
if (!buffer || buffer.length === 0) {
|
||||
console.error('Failed to download audio message');
|
||||
return FALLBACK_MESSAGE;
|
||||
}
|
||||
|
||||
console.log(`Downloaded audio message: ${buffer.length} bytes`);
|
||||
|
||||
const transcript = await transcribeWithWhisperCpp(buffer);
|
||||
|
||||
if (!transcript) {
|
||||
return FALLBACK_MESSAGE;
|
||||
}
|
||||
|
||||
console.log(`Transcribed voice message: ${transcript.length} chars`);
|
||||
return transcript.trim();
|
||||
} catch (err) {
|
||||
console.error('Transcription error:', err);
|
||||
return FALLBACK_MESSAGE;
|
||||
}
|
||||
}
|
||||
|
||||
export function isVoiceMessage(msg: WAMessage): boolean {
|
||||
return msg.message?.audioMessage?.ptt === true;
|
||||
}
|
||||
@@ -0,0 +1,39 @@
# Intent: src/transcription.ts modifications

## What changed
Replaced the OpenAI Whisper API backend with local whisper.cpp CLI execution. Audio is converted from ogg/opus to 16kHz mono WAV via ffmpeg, then transcribed locally using whisper.cpp. No API key or network access is required.

## Key sections

### Imports
- Removed: `readEnvFile` from `./env.js` (no API key needed)
- Added: `execFile` from `child_process`; `fs`; `os`; `path`; `promisify` from `util`

### Configuration
- Removed: `TranscriptionConfig` interface and `DEFAULT_CONFIG` (no model/enabled/fallback config)
- Added: `WHISPER_BIN` constant (env `WHISPER_BIN`, defaulting to `'whisper-cli'`)
- Added: `WHISPER_MODEL` constant (env `WHISPER_MODEL`, defaulting to `data/models/ggml-base.bin`)
- Added: `FALLBACK_MESSAGE` constant

### transcribeWithWhisperCpp (replaces transcribeWithOpenAI)
- Writes the audio buffer to a temporary `.ogg` file
- Converts it to 16kHz mono WAV via ffmpeg
- Runs the whisper.cpp CLI with timestamps disabled
- Cleans up temp files in a `finally` block
- Returns trimmed stdout, or null on error

### transcribeAudioMessage
- Same signature: `(msg: WAMessage, sock: WASocket) => Promise<string | null>`
- Same download logic via `downloadMediaMessage`
- Calls `transcribeWithWhisperCpp` instead of `transcribeWithOpenAI`
- Same fallback behavior on error or null transcript

### isVoiceMessage
- Unchanged: `msg.message?.audioMessage?.ptt === true`

## Invariants (must-keep)
- `transcribeAudioMessage` export signature unchanged
- `isVoiceMessage` export unchanged
- Fallback message string unchanged: `[Voice Message - transcription unavailable]`
- `downloadMediaMessage` call pattern unchanged
- Error logging pattern unchanged
||||
Reference in New Issue
Block a user