Files
OpenSquawk/server/api/atc/ptt.post.ts
2025-09-20 09:46:34 +02:00

208 lines
6.7 KiB
TypeScript

// server/api/atc/ptt.post.ts
import { createError, readBody } from "h3";
import { writeFile, rm } from "node:fs/promises";
import { join } from "node:path";
import { tmpdir } from "node:os";
import { randomUUID } from "node:crypto";
import { execFile } from "node:child_process";
import { getOpenAIClient, routeDecision, type LLMDecisionResult } from "../../utils/openai";
import { createReadStream } from "node:fs";
import { TransmissionLog } from "../../models/TransmissionLog";
import { getUserFromEvent } from "../../utils/auth";
type AudioFormat = 'wav' | 'mp3' | 'ogg' | 'webm'
interface PTTRequest {
audio: string; // Base64 encoded audio
context: {
state_id: string;
state: any;
candidates: Array<{ id: string; state: any }>;
variables: Record<string, any>;
flags: Record<string, any>;
};
moduleId: string;
lessonId: string;
format?: AudioFormat;
autoDecide?: boolean;
}
interface PTTResponse {
success: boolean;
transcription: string;
decision?: {
next_state: string;
controller_say_tpl?: string;
off_schema?: boolean;
radio_check?: boolean;
};
}
async function sh(cmd: string, args: string[]) {
return new Promise<{ stdout: string; stderr: string }>((res, rej) =>
execFile(cmd, args, { encoding: 'utf8' }, (err, stdout, stderr) =>
err ? rej(new Error(stderr || String(err))) : res({ stdout, stderr })
)
);
}
const BASE64_AUDIO_REGEX = /^[A-Za-z0-9+/]+={0,2}$/;
const MAX_AUDIO_BYTES = 2 * 1024 * 1024; // ~60 Sekunden 16kHz Mono
const ALLOWED_AUDIO_FORMATS: AudioFormat[] = ['wav', 'mp3', 'ogg', 'webm'];
const AUDIO_FORMAT_SET = new Set<AudioFormat>(ALLOWED_AUDIO_FORMATS);
function resolveAudioFormat(format?: string | null): AudioFormat {
if (!format) {
return 'wav';
}
const normalized = format.trim().toLowerCase() as AudioFormat;
return AUDIO_FORMAT_SET.has(normalized) ? normalized : 'wav';
}
function decodeAudioPayload(encoded: string): Buffer {
const sanitized = encoded.replace(/\s+/g, '');
if (!sanitized) {
throw createError({ statusCode: 400, statusMessage: 'Audio payload is empty' });
}
if (!BASE64_AUDIO_REGEX.test(sanitized)) {
throw createError({ statusCode: 400, statusMessage: 'Audio payload is not valid base64' });
}
const buffer = Buffer.from(sanitized, 'base64');
if (!buffer.length) {
throw createError({ statusCode: 400, statusMessage: 'Decoded audio payload is empty' });
}
if (buffer.length > MAX_AUDIO_BYTES) {
throw createError({ statusCode: 413, statusMessage: 'Audio payload exceeds the 2 MB limit' });
}
return buffer;
}
// Convert audio to WAV for better Whisper compatibility
async function convertToWav(inputPath: string, outputPath: string) {
await sh("ffmpeg", [
"-y", "-i", inputPath,
"-ar", "16000", // 16 kHz for Whisper
"-ac", "1", // Mono
"-f", "wav",
outputPath
]);
}
export default defineEventHandler(async (event) => {
const body = await readBody<PTTRequest>(event);
if (!body.audio || !body.context || !body.moduleId || !body.lessonId) {
throw createError({
statusCode: 400,
statusMessage: "audio, context, moduleId, and lessonId are required"
});
}
const id = randomUUID();
const format = resolveAudioFormat(body.format);
const tmpAudioInput = join(tmpdir(), `ptt-input-${id}.${format}`);
const tmpAudioWav = join(tmpdir(), `ptt-wav-${id}.wav`);
try {
// 1. Decode audio from base64 and save
const audioBuffer = decodeAudioPayload(body.audio);
await writeFile(tmpAudioInput, audioBuffer);
// 2. Convert to WAV if needed (only when FFmpeg is available)
let audioFileForWhisper = tmpAudioInput;
if (format !== 'wav') {
try {
await convertToWav(tmpAudioInput, tmpAudioWav);
audioFileForWhisper = tmpAudioWav;
} catch (err) {
console.warn('FFmpeg conversion failed, using original audio:', err);
}
}
// 3. OpenAI Whisper for transcription
const openai = getOpenAIClient();
const transcription = await openai.audio.transcriptions.create({
file: createReadStream(audioFileForWhisper),
model: "whisper-1",
language: "en",
prompt: "This is ATC radio communication with aviation phraseology including callsigns, runway numbers, and standard procedures."
});
const transcribedText = transcription.text.trim();
if (!transcribedText) {
throw createError({
statusCode: 400,
statusMessage: "No speech detected in audio"
});
}
const shouldAutoDecide = body.autoDecide !== false;
let decisionResult: LLMDecisionResult | null = null;
let decision: PTTResponse['decision'];
if (shouldAutoDecide) {
// 4. Call the LLM decision directly with the transcribed text
const decisionInput = {
...body.context,
pilot_utterance: transcribedText
};
decisionResult = await routeDecision(decisionInput);
decision = decisionResult.decision;
}
// 5. Cleanup
await rm(tmpAudioInput).catch(() => {});
if (audioFileForWhisper !== tmpAudioInput) {
await rm(tmpAudioWav).catch(() => {});
}
try {
const user = await getUserFromEvent(event)
await TransmissionLog.create({
user: user?._id,
role: "pilot",
channel: "ptt",
direction: "incoming",
text: transcribedText,
metadata: {
moduleId: body.moduleId,
lessonId: body.lessonId,
decision,
decisionTrace: decisionResult?.trace,
autoDecide: shouldAutoDecide,
},
})
} catch (logError) {
console.warn("Transmission logging failed", logError)
}
const result: PTTResponse = {
success: true,
transcription: transcribedText
};
if (decision) {
result.decision = decision;
}
return result;
} catch (error: any) {
// Cleanup on error
await rm(tmpAudioInput).catch(() => {});
await rm(tmpAudioWav).catch(() => {});
if (error.statusCode) {
throw error;
}
throw createError({
statusCode: 500,
statusMessage: `PTT processing failed: ${error.message || error}`
});
}
});