diff --git a/.changes/raw-bytes-audio-renderer b/.changes/raw-bytes-audio-renderer new file mode 100644 index 000000000..fe9455d70 --- /dev/null +++ b/.changes/raw-bytes-audio-renderer @@ -0,0 +1 @@ +patch type="performance" "Send raw PCM bytes in audio renderer instead of boxed int arrays" diff --git a/android/src/main/kotlin/io/livekit/plugin/AudioRenderer.kt b/android/src/main/kotlin/io/livekit/plugin/AudioRenderer.kt index 735bb6fef..4404a4008 100644 --- a/android/src/main/kotlin/io/livekit/plugin/AudioRenderer.kt +++ b/android/src/main/kotlin/io/livekit/plugin/AudioRenderer.kt @@ -105,6 +105,14 @@ class AudioRenderer( } } + /** + * Converts audio data to raw interleaved bytes. + * + * If source and target channel counts match, data is copied directly. + * If target requests fewer channels, the first channels are kept and interleaved. + * + * Sends raw byte arrays instead of boxed sample lists. + */ private fun convertAudioData( audioData: ByteBuffer, bitsPerSample: Int, @@ -112,168 +120,138 @@ class AudioRenderer( numberOfChannels: Int, numberOfFrames: Int ): Map { - // Create result similar to iOS implementation + require(bitsPerSample == 16 || bitsPerSample == 32) { + "Unsupported bitsPerSample: $bitsPerSample" + } + require(numberOfChannels > 0) { + "Invalid numberOfChannels: $numberOfChannels" + } + + val outChannels = targetFormat.numberOfChannels.coerceAtMost(numberOfChannels) + val result = mutableMapOf( "sampleRate" to sampleRate, - "channels" to numberOfChannels, - "frameLength" to numberOfFrames + "channels" to outChannels, ) - // Convert based on target format + val buffer = audioData.duplicate() + buffer.order(ByteOrder.LITTLE_ENDIAN) + buffer.rewind() + when (targetFormat.commonFormat) { "int16" -> { result["commonFormat"] = "int16" - result["data"] = - convertToInt16(audioData, bitsPerSample, numberOfChannels, numberOfFrames) + result["data"] = extractAsInt16Bytes(buffer, bitsPerSample, numberOfChannels, outChannels, numberOfFrames) } - "float32" -> { result["commonFormat"] = "float32" - result["data"] = - convertToFloat32(audioData, bitsPerSample, numberOfChannels, numberOfFrames) + result["data"] = extractAsFloat32Bytes(buffer, bitsPerSample, numberOfChannels, outChannels, numberOfFrames) } - else -> { - result["commonFormat"] = "int16" // Default fallback - result["data"] = - convertToInt16(audioData, bitsPerSample, numberOfChannels, numberOfFrames) + result["commonFormat"] = "int16" + result["data"] = extractAsInt16Bytes(buffer, bitsPerSample, numberOfChannels, outChannels, numberOfFrames) } } return result } - private fun convertToInt16( - audioData: ByteBuffer, + private fun extractAsInt16Bytes( + buffer: ByteBuffer, bitsPerSample: Int, - numberOfChannels: Int, + srcChannels: Int, + outChannels: Int, numberOfFrames: Int - ): List> { - val channelsData = mutableListOf>() + ): ByteArray { + // Fast path for int16 with matching channel count. + if (bitsPerSample == 16 && srcChannels == outChannels) { + val totalBytes = numberOfFrames * outChannels * 2 + val out = ByteArray(totalBytes) + buffer.get(out, 0, totalBytes.coerceAtMost(buffer.remaining())) + return out + } - // Prepare buffer for reading - val buffer = audioData.duplicate() - buffer.order(ByteOrder.LITTLE_ENDIAN) - buffer.rewind() + val out = ByteArray(numberOfFrames * outChannels * 2) + val outBuf = ByteBuffer.wrap(out).order(ByteOrder.LITTLE_ENDIAN) when (bitsPerSample) { 16 -> { - // Already 16-bit, just reformat by channels - for (channel in 0 until numberOfChannels) { - val channelData = mutableListOf() - buffer.position(0) // Start from beginning for each channel - - for (frame in 0 until numberOfFrames) { - val sampleIndex = frame * numberOfChannels + channel - val byteIndex = sampleIndex * 2 - + for (frame in 0 until numberOfFrames) { + val srcOffset = frame * srcChannels * 2 + for (ch in 0 until outChannels) { + val byteIndex = srcOffset + ch * 2 if (byteIndex + 1 < buffer.capacity()) { buffer.position(byteIndex) - val sample = buffer.short.toInt() - channelData.add(sample) + outBuf.putShort((frame * outChannels + ch) * 2, buffer.short) } } - channelsData.add(channelData) } } - 32 -> { - // Convert from 32-bit to 16-bit - for (channel in 0 until numberOfChannels) { - val channelData = mutableListOf() - buffer.position(0) - - for (frame in 0 until numberOfFrames) { - val sampleIndex = frame * numberOfChannels + channel - val byteIndex = sampleIndex * 4 - + for (frame in 0 until numberOfFrames) { + val srcOffset = frame * srcChannels * 4 + for (ch in 0 until outChannels) { + val byteIndex = srcOffset + ch * 4 if (byteIndex + 3 < buffer.capacity()) { buffer.position(byteIndex) - val sample32 = buffer.int - // Convert 32-bit to 16-bit by right-shifting - val sample16 = (sample32 shr 16).toShort().toInt() - channelData.add(sample16) + val sample16 = (buffer.int shr 16).toShort() + outBuf.putShort((frame * outChannels + ch) * 2, sample16) } } - channelsData.add(channelData) - } - } - - else -> { - // Unsupported format, return empty data - repeat(numberOfChannels) { - channelsData.add(emptyList()) } } } - return channelsData + return out } - private fun convertToFloat32( - audioData: ByteBuffer, + private fun extractAsFloat32Bytes( + buffer: ByteBuffer, bitsPerSample: Int, - numberOfChannels: Int, + srcChannels: Int, + outChannels: Int, numberOfFrames: Int - ): List> { - val channelsData = mutableListOf>() + ): ByteArray { + // Fast path for float32 with matching channel count. + if (bitsPerSample == 32 && srcChannels == outChannels) { + val totalBytes = numberOfFrames * outChannels * 4 + val out = ByteArray(totalBytes) + buffer.get(out, 0, totalBytes.coerceAtMost(buffer.remaining())) + return out + } - val buffer = audioData.duplicate() - buffer.order(ByteOrder.LITTLE_ENDIAN) - buffer.rewind() + val out = ByteArray(numberOfFrames * outChannels * 4) + val outBuf = ByteBuffer.wrap(out).order(ByteOrder.LITTLE_ENDIAN) when (bitsPerSample) { 16 -> { - // Convert from 16-bit to float32 - for (channel in 0 until numberOfChannels) { - val channelData = mutableListOf() - buffer.position(0) - - for (frame in 0 until numberOfFrames) { - val sampleIndex = frame * numberOfChannels + channel - val byteIndex = sampleIndex * 2 - + for (frame in 0 until numberOfFrames) { + val srcOffset = frame * srcChannels * 2 + for (ch in 0 until outChannels) { + val byteIndex = srcOffset + ch * 2 if (byteIndex + 1 < buffer.capacity()) { buffer.position(byteIndex) - val sample16 = buffer.short - // Convert to float (-1.0 to 1.0) - val sampleFloat = sample16.toFloat() / Short.MAX_VALUE - channelData.add(sampleFloat) + val sampleFloat = buffer.short.toFloat() / Short.MAX_VALUE + outBuf.putFloat((frame * outChannels + ch) * 4, sampleFloat) } } - channelsData.add(channelData) } } - 32 -> { - // Assume 32-bit float input - for (channel in 0 until numberOfChannels) { - val channelData = mutableListOf() - buffer.position(0) - - for (frame in 0 until numberOfFrames) { - val sampleIndex = frame * numberOfChannels + channel - val byteIndex = sampleIndex * 4 - + for (frame in 0 until numberOfFrames) { + val srcOffset = frame * srcChannels * 4 + for (ch in 0 until outChannels) { + val byteIndex = srcOffset + ch * 4 if (byteIndex + 3 < buffer.capacity()) { buffer.position(byteIndex) - val sampleFloat = buffer.float - channelData.add(sampleFloat) + outBuf.putFloat((frame * outChannels + ch) * 4, buffer.float) } } - channelsData.add(channelData) - } - } - - else -> { - // Unsupported format - repeat(numberOfChannels) { - channelsData.add(emptyList()) } } } - return channelsData + return out } } diff --git a/lib/src/preconnect/pre_connect_audio_buffer.dart b/lib/src/preconnect/pre_connect_audio_buffer.dart index 17f465ba3..d01cea2cb 100644 --- a/lib/src/preconnect/pre_connect_audio_buffer.dart +++ b/lib/src/preconnect/pre_connect_audio_buffer.dart @@ -13,7 +13,6 @@ // limitations under the License. import 'dart:async'; -import 'dart:typed_data'; import 'package:flutter/services.dart'; @@ -41,8 +40,8 @@ typedef PreConnectOnError = void Function(Object error); /// still connecting and dispatching an agent, then the buffered audio is sent /// once the agent becomes active. /// -/// Audio is buffered in memory (bounded by [defaultMaxSize]); if it overflows, -/// the oldest audio is dropped until the agent is ready. +/// Audio is buffered in memory and bounded by [defaultMaxSize]. +/// If it overflows, the oldest audio is dropped until the agent is ready. class PreConnectAudioBuffer { /// Topic used to send the buffered audio stream to agents. static const String dataTopic = 'lk.agent.pre-connect-audio-buffer'; @@ -66,6 +65,7 @@ class PreConnectAudioBuffer { PreConnectOnError? _onError; final int _requestSampleRate; int? _renderedSampleRate; + int? _renderedChannels; bool _nativeRecordingStarted = false; bool _hasLoggedOverflow = false; @@ -107,8 +107,8 @@ class PreConnectAudioBuffer { /// [agentReadyFuture] completes with an error and callers should [reset] the /// buffer. /// - /// - Note: Ensure microphone permissions are granted before calling this, or - /// audio capture may fail depending on platform. + /// Ensure microphone permissions are granted before calling this. + /// Audio capture may fail without permissions. Future startRecording({ Duration timeout = const Duration(seconds: 20), }) async { @@ -161,13 +161,11 @@ class PreConnectAudioBuffer { } try { - // Actual sample rate of the audio data, can differ from the request sample rate + // Audio format can differ from what was requested. _renderedSampleRate = event['sampleRate'] as int; - final dataChannels = event['data'] as List; - final monoData = dataChannels[0].cast(); - // Convert Int16 values to bytes using typed data view - final int16List = Int16List.fromList(monoData); - final bytes = int16List.buffer.asUint8List(); + _renderedChannels = event['channels'] as int; + // Native sends raw interleaved PCM bytes. + final Uint8List bytes = event['data'] as Uint8List; final didOverflow = _buffer.write(bytes); if (didOverflow && !_hasLoggedOverflow) { @@ -181,7 +179,7 @@ class PreConnectAudioBuffer { } }); - // Listen for agent readiness; when active, attempt to send buffer once. + // Listen for agent readiness and send the buffer when active. _participantStateListener = _room.events.on( filter: (event) => event.participant.kind == ParticipantKind.AGENT && event.state == ParticipantState.active, (event) async { @@ -260,7 +258,7 @@ class PreConnectAudioBuffer { _participantStateListener = null; _buffer.clear(); - // Don't stop the local track - it will continue to be used by the Room + // Keep the local track because the Room still uses it. _localTrack = null; _agentReadyManager.reset(); @@ -269,6 +267,7 @@ class PreConnectAudioBuffer { // Reset the _isSent flag to allow data sending on next use _isBufferSent = false; _hasLoggedOverflow = false; + _renderedChannels = null; logger.info('[Preconnect audio] reset'); } @@ -281,8 +280,8 @@ class PreConnectAudioBuffer { /// Sends the currently buffered audio to one or more agent identities. /// - /// This is a one-shot operation; repeated calls are ignored after the buffer - /// has been sent. + /// This is a one shot operation. + /// Repeated calls are ignored after the buffer has been sent. /// /// The stream is written to [topic] (default: [dataTopic]) and includes /// attributes that help the agent interpret the raw audio payload. @@ -294,10 +293,15 @@ class PreConnectAudioBuffer { if (agents.isEmpty) return; final sampleRate = _renderedSampleRate; + final rawChannels = _renderedChannels ?? 1; + final channels = rawChannels > 0 ? rawChannels : 1; if (sampleRate == null) { logger.severe('[Preconnect audio] renderedSampleRate is null'); return; } + if (rawChannels <= 0) { + logger.warning('[Preconnect audio] Invalid rendered channels: $rawChannels. Falling back to mono.'); + } // Wait for local track published event final localTrackPublishedEvent = await _localTrackPublishedEvent; @@ -320,7 +324,7 @@ class PreConnectAudioBuffer { topic: topic, attributes: { 'sampleRate': sampleRate.toString(), - 'channels': '1', + 'channels': channels.toString(), 'trackId': localTrackSid, }, totalSize: data.length, @@ -334,9 +338,10 @@ class PreConnectAudioBuffer { await writer.close(); // Compute seconds of audio data sent - final int bytesPerSample = 2; // Assuming 16-bit audio - final int totalSamples = data.length ~/ bytesPerSample; - final double secondsOfAudio = totalSamples / sampleRate; + final int bytesPerSample = 2; // 16-bit audio + final int bytesPerFrame = bytesPerSample * channels; + final int totalFrames = data.length ~/ bytesPerFrame; + final double secondsOfAudio = totalFrames / sampleRate; logger.info( '[Preconnect audio] sent ${(data.length / 1024).toStringAsFixed(1)}KB of audio (${secondsOfAudio.toStringAsFixed(2)} seconds) to ${agents} agent(s)'); diff --git a/shared_swift/AudioRenderer.swift b/shared_swift/AudioRenderer.swift index 161af309a..3f8127323 100644 --- a/shared_swift/AudioRenderer.swift +++ b/shared_swift/AudioRenderer.swift @@ -76,66 +76,79 @@ extension AudioRenderer: FlutterStreamHandler { } public extension AVAudioPCMBuffer { - func serialize() -> [String: Any] { - // The format of the data: - // { - // "sampleRate": 48000.0, - // "channelCount": 2, - // "frameLength": 480, - // "format": "float32", // or "int16", "int32", "unknown" - // "data": [ - // [/* channel 0 audio samples */], - // [/* channel 1 audio samples */] - // ] - // } - - // Create the result dictionary to send to Flutter + /// Serializes audio data as raw interleaved bytes. + /// + /// Mono buffers are copied directly. + /// Multi-channel buffers are interleaved in sample order. + /// + /// Uses `FlutterStandardTypedData` to send a binary payload. + func serializeAsBytes() -> [String: Any] { + let channels = Int(format.channelCount) + let frames = Int(frameLength) + var result: [String: Any] = [ "sampleRate": UInt(format.sampleRate), - "channels": UInt(format.channelCount), - "frameLength": UInt(frameLength), + "channels": UInt(channels), ] - // Extract audio data based on the buffer format - if let floatChannelData { - // Buffer contains float data - var channelsData: [[Float]] = [] - - for channel in 0 ..< Int(format.channelCount) { - let channelPointer = floatChannelData[channel] - let channelArray = Array(UnsafeBufferPointer(start: channelPointer, count: Int(frameLength))) - channelsData.append(channelArray) - } - - result["data"] = channelsData - result["commonFormat"] = "float32" - } else if let int16ChannelData { - // Buffer contains int16 data - var channelsData: [[Int16]] = [] - - for channel in 0 ..< Int(format.channelCount) { - let channelPointer = int16ChannelData[channel] - let channelArray = Array(UnsafeBufferPointer(start: channelPointer, count: Int(frameLength))) - channelsData.append(channelArray) + if let int16ChannelData { + let data: Data + if channels == 1 { + // Fast path for mono. + data = Data(bytes: int16ChannelData[0], count: frames * MemoryLayout.size) + } else { + // Interleave channels + var bytes = Data(count: frames * channels * MemoryLayout.size) + bytes.withUnsafeMutableBytes { raw in + let out = raw.bindMemory(to: Int16.self) + for frame in 0...size) + } else { + var bytes = Data(count: frames * channels * MemoryLayout.size) + bytes.withUnsafeMutableBytes { raw in + let out = raw.bindMemory(to: Float32.self) + for frame in 0..(start: channelPointer, count: Int(frameLength))) - channelsData.append(channelArray) + let data: Data + if channels == 1 { + data = Data(bytes: int32ChannelData[0], count: frames * MemoryLayout.size) + } else { + var bytes = Data(count: frames * channels * MemoryLayout.size) + bytes.withUnsafeMutableBytes { raw in + let out = raw.bindMemory(to: Int32.self) + for frame in 0..