feat: add voice transcription as nanorepo skill (#326)

Add voice transcription skill package at
.claude/skills/add-voice-transcription/ so it can be applied via the
skills engine. Skill adds src/transcription.ts (OpenAI Whisper), modifies
whatsapp.ts to detect/transcribe voice notes, and includes intent files,
3 test cases, and 8 skill validation tests.

Also fixes skills engine runNpmInstall() to use --legacy-peer-deps,
needed for any skill adding deps with Zod v3 peer requirements.

Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
gavrielc
2026-02-20 14:18:54 +02:00
committed by GitHub
parent 6b9b3a12c9
commit a4072162b7
9 changed files with 1686 additions and 412 deletions

View File

@@ -0,0 +1,98 @@
import { downloadMediaMessage } from '@whiskeysockets/baileys';
import { WAMessage, WASocket } from '@whiskeysockets/baileys';
import { readEnvFile } from './env.js';
/** Tunables for voice-note transcription. */
interface TranscriptionConfig {
  // OpenAI transcription model identifier passed to the API (e.g. 'whisper-1').
  model: string;
  // When false, transcription is skipped and the fallback text is returned instead.
  enabled: boolean;
  // Text surfaced to the chat whenever a transcript cannot be produced.
  fallbackMessage: string;
}
// Module-wide defaults: transcription enabled, Whisper v1, with a neutral
// placeholder shown when the audio cannot be transcribed.
const DEFAULT_CONFIG: TranscriptionConfig = {
  model: 'whisper-1',
  enabled: true,
  fallbackMessage: '[Voice Message - transcription unavailable]',
};
/**
 * Send an audio buffer to OpenAI's transcription endpoint.
 *
 * Reads OPENAI_API_KEY from the .env file. Returns the transcript text on
 * success, or null (after logging) when the key is missing or the API call
 * throws, so the caller can substitute its fallback message.
 */
async function transcribeWithOpenAI(
  audioBuffer: Buffer,
  config: TranscriptionConfig,
): Promise<string | null> {
  const { OPENAI_API_KEY: apiKey } = readEnvFile(['OPENAI_API_KEY']);
  if (!apiKey) {
    console.warn('OPENAI_API_KEY not set in .env');
    return null;
  }
  try {
    // Load the SDK lazily so the dependency is only pulled in when a voice
    // note actually needs transcribing.
    const { default: OpenAI, toFile } = await import('openai');
    const client = new OpenAI({ apiKey });
    const upload = await toFile(audioBuffer, 'voice.ogg', {
      type: 'audio/ogg',
    });
    const result = await client.audio.transcriptions.create({
      file: upload,
      model: config.model,
      response_format: 'text',
    });
    // With response_format 'text' the SDK resolves to a plain string rather
    // than the typed transcription object, hence the cast.
    return result as unknown as string;
  } catch (err) {
    console.error('OpenAI transcription failed:', err);
    return null;
  }
}
/**
 * Download a WhatsApp voice note and return its transcript.
 *
 * Resolves to the trimmed transcript on success. Resolves to the configured
 * fallback text when transcription is disabled, the download yields no data,
 * transcription returns nothing, or any step throws — this function never
 * rejects.
 */
export async function transcribeAudioMessage(
  msg: WAMessage,
  sock: WASocket,
): Promise<string | null> {
  const config = DEFAULT_CONFIG;
  if (!config.enabled) {
    return config.fallbackMessage;
  }
  try {
    const audio = (await downloadMediaMessage(
      msg,
      'buffer',
      {},
      {
        // Baileys expects a pino-style logger; console is close enough for
        // the calls it makes here — NOTE(review): confirm against Baileys.
        logger: console as any,
        reuploadRequest: sock.updateMediaMessage,
      },
    )) as Buffer;
    const hasData = audio != null && audio.length > 0;
    if (!hasData) {
      console.error('Failed to download audio message');
      return config.fallbackMessage;
    }
    console.log(`Downloaded audio message: ${audio.length} bytes`);
    const text = await transcribeWithOpenAI(audio, config);
    // Empty/null transcript degrades to the placeholder, never to ''.
    return text ? text.trim() : config.fallbackMessage;
  } catch (err) {
    console.error('Transcription error:', err);
    return config.fallbackMessage;
  }
}
/**
 * True when the message is a WhatsApp push-to-talk voice note (an audio
 * message whose `ptt` flag is explicitly true).
 */
export function isVoiceMessage(msg: WAMessage): boolean {
  const audio = msg.message?.audioMessage;
  return audio?.ptt === true;
}