src/services/elevenLabsAPI/textToDialogue/index.ts

0% Statements 0/28
0% Branches 0/15
0% Functions 0/3
0% Lines 0/28

import z from "zod";
import { ELEVEN_LABS_API_BASE_URL } from "../constants";
 
// Zod schemas for request/response validation
const AllowedOutputFormatsSchema = z.enum([
    "mp3_22050_32",
    "mp3_24000_48",
    "mp3_44100_32",
    "mp3_44100_64",
    "mp3_44100_96",
    "mp3_44100_128",
    "mp3_44100_192",
    "pcm_8000",
    "pcm_16000",
    "pcm_22050",
    "pcm_24000",
    "pcm_32000",
    "pcm_44100",
    "pcm_48000",
    "ulaw_8000",
    "alaw_8000",
    "opus_48000_32",
    "opus_48000_64",
    "opus_48000_96",
    "opus_48000_128",
    "opus_48000_192",
]);
 
const DialogueInputSchema = z.object({
    text: z.string(),
    voiceID: z.string(),
});
 
const ModelSettingsSchema = z.object({
    stability: z.number().nullable().optional().default(0.5),
});
 
const PronunciationDictionaryLocatorSchema = z.object({
    pronunciationDictionaryID: z.string(),
    versionID: z.string().nullable().optional(),
});
 
const ApplyTextNormalizationSchema = z.enum(["auto", "on", "off"]).default("auto");
 
const TextToDialogueOutputSchema = z.object({
    audio: z.string(), // base64 encoded audio
    contentType: z.string(),
});
 
// Types
export type DialogueInput = z.infer<typeof DialogueInputSchema>;
export type ModelSettings = z.infer<typeof ModelSettingsSchema>;
export type PronunciationDictionaryLocator = z.infer<typeof PronunciationDictionaryLocatorSchema>;
export type AllowedOutputFormats = z.infer<typeof AllowedOutputFormatsSchema>;
export type ApplyTextNormalization = z.infer<typeof ApplyTextNormalizationSchema>;
export type TextToDialogueOutput = z.infer<typeof TextToDialogueOutputSchema>;
 
export interface TextToDialogueParams {
    inputs: DialogueInput[];
    modelID?: string;
    languageCode?: string | null;
    settings?: ModelSettings | null;
    pronunciationDictionaryLocators?: PronunciationDictionaryLocator[] | null;
    seed?: number | null;
    applyTextNormalization?: ApplyTextNormalization;
    outputFormat?: AllowedOutputFormats;
}
 
// Data fetching function: calls the ElevenLabs text-to-dialogue endpoint and returns
// the generated audio as a base64 string along with its content type. Note that no
// authentication header is set here, so any required ElevenLabs API key is assumed
// to be supplied elsewhere (for example by a proxy in front of the API).
export async function textToDialogue(params: TextToDialogueParams): Promise<TextToDialogueOutput> {
    const {
        inputs,
        modelID = "eleven_v3",
        languageCode,
        settings,
        pronunciationDictionaryLocators,
        seed,
        applyTextNormalization,
        outputFormat,
    } = params;
 
    // Build query params
    const queryParams = new URLSearchParams();
    if (outputFormat) {
        queryParams.append("output_format", outputFormat);
    }
 
    const url = `${ELEVEN_LABS_API_BASE_URL}/text-to-dialogue${queryParams.toString() ? `?${queryParams.toString()}` : ""}`;
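
    // For illustration: if ELEVEN_LABS_API_BASE_URL were "https://api.elevenlabs.io/v1"
    // (an assumed value; the real constant lives in ../constants), a call with
    // outputFormat "mp3_44100_128" would target:
    //   https://api.elevenlabs.io/v1/text-to-dialogue?output_format=mp3_44100_128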
 
    // Transform camelCase to snake_case for API
    const requestBody = {
        inputs: inputs.map((item) => ({
            text: item.text,
            voice_id: item.voiceID,
        })),
        model_id: modelID,
        language_code: languageCode,
        settings,
        pronunciation_dictionary_locators: pronunciationDictionaryLocators?.map((locator) => ({
            pronunciation_dictionary_id: locator.pronunciationDictionaryID,
            version_id: locator.versionID,
        })),
        seed,
        apply_text_normalization: applyTextNormalization,
    };
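
    // For illustration, with only required fields set, the body above serializes to
    // JSON roughly like this (voice IDs are placeholders; undefined optional fields
    // are dropped by JSON.stringify):
    //
    // {
    //   "inputs": [{ "text": "Hello!", "voice_id": "voice-id-a" }],
    //   "model_id": "eleven_v3"
    // }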
 
    const response = await fetch(url, {
        method: "POST",
        headers: {
            "Content-Type": "application/json",
        },
        body: JSON.stringify(requestBody),
    });
 
    if (!response.ok) {
        const errorText = await response.text();
        throw new Error(`ElevenLabs API error (${response.status}): ${errorText}`);
    }
 
    // Read the raw audio bytes from the response
    const arrayBuffer = await response.arrayBuffer();

    // Determine the content type, falling back to a generic binary type
    const contentType = response.headers.get("content-type") || "application/octet-stream";
 
    // Validate that we actually received audio data
    if (arrayBuffer.byteLength === 0) {
        throw new Error("Received empty audio response from ElevenLabs API");
    }
 
    // Convert audio to base64 in chunks, since spreading a large Uint8Array into
    // String.fromCharCode can exceed the engine's argument limit
    const uint8Array = new Uint8Array(arrayBuffer);
    const chunkSize = 0x8000;
    let binary = "";
    for (let i = 0; i < uint8Array.length; i += chunkSize) {
        binary += String.fromCharCode(...uint8Array.subarray(i, i + chunkSize));
    }
    const base64 = btoa(binary);

    // Validate base64 string
    if (!base64) {
        throw new Error("Failed to convert audio to base64");
    }
 
    const result = {
        audio: base64,
        contentType,
    };
 
    // Validate output with Zod
    return TextToDialogueOutputSchema.parse(result);
}
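
// Illustrative usage sketch (voice IDs below are placeholders, not real ElevenLabs IDs):
//
// const dialogue = await textToDialogue({
//     inputs: [
//         { text: "Hello there, how was your day?", voiceID: "voice-id-a" },
//         { text: "Pretty good, thanks for asking!", voiceID: "voice-id-b" },
//     ],
//     outputFormat: "mp3_44100_128",
// });
// // dialogue.audio is base64-encoded audio; dialogue.contentType is e.g. "audio/mpeg"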