// blob: b605af222bdb969c1c8ea7e28c697b11795dddb6 [file] [edit]
// Copyright 2025 the V8 project authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
// Convert a WAV audio file to a format that the Transformers.js Whisper model
// can work with.
// Write the output as raw float values, so we don't need any additional
// WAV/RIFF parser but can directly use it.
// See https://huggingface.co/docs/transformers.js/guides/node-audio-processing
import wavefile from 'wavefile';
import * as fs from 'fs';
// Command line: node convert-audio.mjs input.wav output.raw <start> <end>
// All four arguments are required, so `process.argv` (which also holds the
// node binary and the script path) must contain at least six entries.
const usageMessage = "Usage: node convert-audio.mjs input.wav output.raw <start_sample> <end_sample>";
if (process.argv.length < 6) {
  throw new Error(usageMessage);
}
const inputFilename = process.argv[2];
const outputFilename = process.argv[3];
// Pass the radix explicitly so the bounds are always read as decimal numbers.
const outputStartSample = Number.parseInt(process.argv[4], 10);
const outputEndSample = Number.parseInt(process.argv[5], 10);
// `parseInt` yields NaN for non-numeric input; fail early with a clear message
// instead of letting NaN flow into the sample selection further down.
if (!Number.isInteger(outputStartSample) || !Number.isInteger(outputEndSample)) {
  throw new Error(`${usageMessage}\n<start_sample> and <end_sample> must be integers.`);
}
// Parse the input file and capture its original format right away; these
// values are only used for the summary printed at the end.
const wav = new wavefile.WaveFile(fs.readFileSync(inputFilename));
const {
  numChannels: inputChannelCount,
  sampleRate: inputSampleRate,
  bitsPerSample: inputBitDepth,
} = wav.fmt;
// Whisper expects audio with a sampling rate of 16000.
const outputSampleRate = 16000;
// The pipeline expects its input as a Float32Array, so convert to 32-bit
// float samples first and resample afterwards.
wav.toBitDepth('32f');
wav.toSampleRate(outputSampleRate);
// Ask for de-interleaved samples and request `Float32Array` explicitly to keep
// the raw output small. (The default would be `Float64Array`.)
let audioData = wav.getSamples(
  /* interleaved */ false,
  Float32Array,
);
// Reduce the audio to a single channel when the file has more than one.
const outputChannelCount = 1;
if (Array.isArray(audioData)) {
  // An array here means one sample buffer per channel.
  const [first, second] = audioData;
  if (audioData.length > 1) {
    // Average the first two channels and scale the result by sqrt(2), writing
    // the mix back into the first channel's buffer. Any further channels are
    // not part of the mix.
    const gain = Math.sqrt(2);
    first.forEach((sample, index) => {
      first[index] = gain * (sample + second[index]) / 2;
    });
  }
  audioData = first;
}
const inputSampleCount = audioData.length;
// Select the samples in [outputStartSample, outputEndSample) and write them.
// Invalid ranges abort the script: merely logging (as `console.assert` does)
// would still produce a wrong or empty output file. The conditions are phrased
// so that a NaN bound (from an unparsable argument) is rejected as well, and a
// negative start is refused because `slice` would count it from the end.
if (!(outputStartSample >= 0 && outputStartSample < outputEndSample)) {
  throw new RangeError(
    `Given start (${outputStartSample}) must be non-negative and smaller than end (${outputEndSample}).`);
}
if (!(outputEndSample <= audioData.length)) {
  throw new RangeError(
    `Given end (${outputEndSample}) is beyond the ${audioData.length} samples of the input file.`);
}
audioData = audioData.slice(outputStartSample, outputEndSample);
const outputSampleCount = audioData.length;
const outputSizeBytes = audioData.byteLength;
// The sample buffer is written as-is: raw values without any header.
fs.writeFileSync(outputFilename, audioData);
// Print a short report: the duration of the written audio, followed by one
// line describing the input and one line describing the output.
const durationSeconds = outputSampleCount / outputSampleRate;
const summaryLines = [
  `Converted ${durationSeconds}s of audio`,
  ` from '${inputFilename}', ${inputChannelCount} channel(s), ${inputSampleRate} Hz, ${inputBitDepth} bit, ${inputSampleCount} samples`,
  ` to '${outputFilename}', ${outputChannelCount} channel(s), ${outputSampleRate} Hz, 32 bit float, ${outputSampleCount} samples, ${outputSizeBytes} bytes`,
];
for (const line of summaryLines) {
  console.log(line);
}