Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .changes/raw-bytes-audio-renderer
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
patch type="performance" "Send raw PCM bytes in audio renderer instead of boxed int arrays"
178 changes: 78 additions & 100 deletions android/src/main/kotlin/io/livekit/plugin/AudioRenderer.kt
Original file line number Diff line number Diff line change
Expand Up @@ -105,175 +105,153 @@ class AudioRenderer(
}
}

/**
 * Builds the event payload for one captured audio buffer as raw interleaved PCM bytes.
 *
 * The payload carries a single [ByteArray] under `"data"` instead of per-channel boxed
 * sample lists. When the target format requests fewer channels than the source has,
 * only the first channels of every frame are kept; the output never has more channels
 * than the source.
 *
 * @param audioData source samples, interleaved; read through a duplicate so the caller's
 *   position/limit are left untouched
 * @param bitsPerSample width of one source sample; only 16 and 32 are accepted
 * @param sampleRate sample rate reported back unchanged under `"sampleRate"`
 * @param numberOfChannels channel count of [audioData]; must be positive
 * @param numberOfFrames number of frames in [audioData]; must not be negative
 * @return map with `sampleRate`, `channels` (channels actually emitted), `frameLength`,
 *   `commonFormat` (`"int16"` or `"float32"`) and `data` (the interleaved bytes)
 * @throws IllegalArgumentException if any of the size arguments is out of range
 */
private fun convertAudioData(
    audioData: ByteBuffer,
    bitsPerSample: Int,
    sampleRate: Int,
    numberOfChannels: Int,
    numberOfFrames: Int
): Map<String, Any> {
    require(bitsPerSample == 16 || bitsPerSample == 32) {
        "Unsupported bitsPerSample: $bitsPerSample"
    }
    require(numberOfChannels > 0) {
        "Invalid numberOfChannels: $numberOfChannels"
    }
    require(numberOfFrames >= 0) {
        "Invalid numberOfFrames: $numberOfFrames"
    }

    // Never emit more channels than the source has, and never fewer than one: a
    // non-positive target channel count would otherwise yield a negative array size.
    val outChannels = targetFormat.numberOfChannels.coerceIn(1, numberOfChannels)

    // duplicate() resets the byte order to big-endian, so the order has to be set on
    // the duplicate itself before the helpers read multi-byte samples from it.
    val buffer = audioData.duplicate()
    buffer.order(ByteOrder.LITTLE_ENDIAN)
    buffer.rewind()

    // Any format other than "float32" falls back to int16.
    val (commonFormat, data) = when (targetFormat.commonFormat) {
        "float32" ->
            "float32" to extractAsFloat32Bytes(buffer, bitsPerSample, numberOfChannels, outChannels, numberOfFrames)

        else ->
            "int16" to extractAsInt16Bytes(buffer, bitsPerSample, numberOfChannels, outChannels, numberOfFrames)
    }

    return mutableMapOf<String, Any>(
        "sampleRate" to sampleRate,
        "channels" to outChannels,
        // NOTE(review): kept so receivers that may still read frameLength keep working — confirm whether it can be dropped.
        "frameLength" to numberOfFrames,
        "commonFormat" to commonFormat,
        "data" to data,
    )
}

/**
 * Extracts the first [outChannels] channels of every frame as interleaved
 * little-endian int16 PCM.
 *
 * The returned array always has `numberOfFrames * outChannels * 2` bytes. Samples that
 * lie beyond the readable part of [buffer] are left as zero instead of raising.
 *
 * @param buffer interleaved source samples; the caller is expected to have set it to
 *   little-endian order and rewound it
 * @param bitsPerSample width of one source sample, 16 or 32
 * @param srcChannels number of interleaved channels in [buffer]
 * @param outChannels number of leading channels to keep; callers pass a value that does
 *   not exceed [srcChannels]
 * @param numberOfFrames number of frames to emit
 * @return the interleaved int16 samples as raw bytes
 * @throws IllegalArgumentException if [bitsPerSample] is neither 16 nor 32
 */
private fun extractAsInt16Bytes(
    buffer: ByteBuffer,
    bitsPerSample: Int,
    srcChannels: Int,
    outChannels: Int,
    numberOfFrames: Int
): ByteArray {
    val out = ByteArray(numberOfFrames * outChannels * 2)

    // Fast path: the source is already int16 with the wanted channel layout, so the
    // bytes can be copied verbatim from the buffer's current position.
    if (bitsPerSample == 16 && srcChannels == outChannels) {
        buffer.get(out, 0, out.size.coerceAtMost(buffer.remaining()))
        return out
    }

    val srcSampleBytes = when (bitsPerSample) {
        16 -> 2
        32 -> 4
        else -> throw IllegalArgumentException("Unsupported bitsPerSample: $bitsPerSample")
    }
    val srcFrameBytes = srcChannels * srcSampleBytes
    // Bound reads by limit(), not capacity(): bytes between limit and capacity are not
    // readable, and touching them throws.
    val readableEnd = buffer.limit()
    val outBuf = ByteBuffer.wrap(out).order(ByteOrder.LITTLE_ENDIAN)

    for (frame in 0 until numberOfFrames) {
        val frameStart = frame * srcFrameBytes
        for (ch in 0 until outChannels) {
            val srcIndex = frameStart + ch * srcSampleBytes
            if (srcIndex + srcSampleBytes > readableEnd) continue

            // Absolute reads leave the buffer position alone.
            val sample: Short = if (bitsPerSample == 16) {
                buffer.getShort(srcIndex)
            } else {
                // Keeps the top 16 bits. NOTE(review): this treats 32-bit input as
                // integer PCM — confirm the source never delivers 32-bit float here.
                (buffer.getInt(srcIndex) shr 16).toShort()
            }
            outBuf.putShort((frame * outChannels + ch) * 2, sample)
        }
    }

    return out
}

/**
 * Extracts the first [outChannels] channels of every frame as interleaved
 * little-endian float32 PCM.
 *
 * The returned array always has `numberOfFrames * outChannels * 4` bytes. Samples that
 * lie beyond the readable part of [buffer] are left as zero instead of raising.
 *
 * @param buffer interleaved source samples; the caller is expected to have set it to
 *   little-endian order and rewound it
 * @param bitsPerSample width of one source sample, 16 or 32
 * @param srcChannels number of interleaved channels in [buffer]
 * @param outChannels number of leading channels to keep; callers pass a value that does
 *   not exceed [srcChannels]
 * @param numberOfFrames number of frames to emit
 * @return the interleaved float32 samples as raw bytes
 * @throws IllegalArgumentException if [bitsPerSample] is neither 16 nor 32
 */
private fun extractAsFloat32Bytes(
    buffer: ByteBuffer,
    bitsPerSample: Int,
    srcChannels: Int,
    outChannels: Int,
    numberOfFrames: Int
): ByteArray {
    val out = ByteArray(numberOfFrames * outChannels * 4)

    // Fast path: 32-bit source with the wanted channel layout is copied verbatim from
    // the buffer's current position.
    if (bitsPerSample == 32 && srcChannels == outChannels) {
        buffer.get(out, 0, out.size.coerceAtMost(buffer.remaining()))
        return out
    }

    val srcSampleBytes = when (bitsPerSample) {
        16 -> 2
        32 -> 4
        else -> throw IllegalArgumentException("Unsupported bitsPerSample: $bitsPerSample")
    }
    val srcFrameBytes = srcChannels * srcSampleBytes
    // Bound reads by limit(), not capacity(): bytes between limit and capacity are not
    // readable, and touching them throws.
    val readableEnd = buffer.limit()
    val outBuf = ByteBuffer.wrap(out).order(ByteOrder.LITTLE_ENDIAN)

    for (frame in 0 until numberOfFrames) {
        val frameStart = frame * srcFrameBytes
        for (ch in 0 until outChannels) {
            val srcIndex = frameStart + ch * srcSampleBytes
            if (srcIndex + srcSampleBytes > readableEnd) continue

            // Absolute reads leave the buffer position alone.
            val sample: Float = if (bitsPerSample == 16) {
                // Divide by 32768 so the full int16 range maps into [-1.0, 1.0);
                // dividing by Short.MAX_VALUE pushes Short.MIN_VALUE below -1.0.
                buffer.getShort(srcIndex) / 32768f
            } else {
                // NOTE(review): 32-bit input is taken to be float already — confirm the
                // source never delivers 32-bit integer PCM here.
                buffer.getFloat(srcIndex)
            }
            outBuf.putFloat((frame * outChannels + ch) * 4, sample)
        }
    }

    return out
}
}

Expand Down
43 changes: 24 additions & 19 deletions lib/src/preconnect/pre_connect_audio_buffer.dart
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
// limitations under the License.

import 'dart:async';
import 'dart:typed_data';

import 'package:flutter/services.dart';

Expand Down Expand Up @@ -41,8 +40,8 @@ typedef PreConnectOnError = void Function(Object error);
/// still connecting and dispatching an agent, then the buffered audio is sent
/// once the agent becomes active.
///
/// Audio is buffered in memory (bounded by [defaultMaxSize]); if it overflows,
/// the oldest audio is dropped until the agent is ready.
/// Audio is buffered in memory and bounded by [defaultMaxSize].
/// If it overflows, the oldest audio is dropped until the agent is ready.
class PreConnectAudioBuffer {
/// Topic used to send the buffered audio stream to agents.
static const String dataTopic = 'lk.agent.pre-connect-audio-buffer';
Expand All @@ -66,6 +65,7 @@ class PreConnectAudioBuffer {
PreConnectOnError? _onError;
final int _requestSampleRate;
int? _renderedSampleRate;
int? _renderedChannels;

bool _nativeRecordingStarted = false;
bool _hasLoggedOverflow = false;
Expand Down Expand Up @@ -107,8 +107,8 @@ class PreConnectAudioBuffer {
/// [agentReadyFuture] completes with an error and callers should [reset] the
/// buffer.
///
/// - Note: Ensure microphone permissions are granted before calling this, or
/// audio capture may fail depending on platform.
/// Ensure microphone permissions are granted before calling this.
/// Audio capture may fail without permissions.
Future<void> startRecording({
Duration timeout = const Duration(seconds: 20),
}) async {
Expand Down Expand Up @@ -161,13 +161,11 @@ class PreConnectAudioBuffer {
}

try {
// Actual sample rate of the audio data, can differ from the request sample rate
// Audio format can differ from what was requested.
_renderedSampleRate = event['sampleRate'] as int;
final dataChannels = event['data'] as List<dynamic>;
final monoData = dataChannels[0].cast<int>();
// Convert Int16 values to bytes using typed data view
final int16List = Int16List.fromList(monoData);
final bytes = int16List.buffer.asUint8List();
_renderedChannels = event['channels'] as int;
// Native sends raw interleaved PCM bytes.
final Uint8List bytes = event['data'] as Uint8List;

final didOverflow = _buffer.write(bytes);
if (didOverflow && !_hasLoggedOverflow) {
Expand All @@ -181,7 +179,7 @@ class PreConnectAudioBuffer {
}
});

// Listen for agent readiness; when active, attempt to send buffer once.
// Listen for agent readiness and send the buffer when active.
_participantStateListener = _room.events.on<ParticipantStateUpdatedEvent>(
filter: (event) => event.participant.kind == ParticipantKind.AGENT && event.state == ParticipantState.active,
(event) async {
Expand Down Expand Up @@ -260,7 +258,7 @@ class PreConnectAudioBuffer {
_participantStateListener = null;
_buffer.clear();

// Don't stop the local track - it will continue to be used by the Room
// Keep the local track because the Room still uses it.
_localTrack = null;

_agentReadyManager.reset();
Expand All @@ -269,6 +267,7 @@ class PreConnectAudioBuffer {
// Reset the _isSent flag to allow data sending on next use
_isBufferSent = false;
_hasLoggedOverflow = false;
_renderedChannels = null;

logger.info('[Preconnect audio] reset');
}
Expand All @@ -281,8 +280,8 @@ class PreConnectAudioBuffer {

/// Sends the currently buffered audio to one or more agent identities.
///
/// This is a one-shot operation; repeated calls are ignored after the buffer
/// has been sent.
/// This is a one-shot operation.
/// Repeated calls are ignored after the buffer has been sent.
///
/// The stream is written to [topic] (default: [dataTopic]) and includes
/// attributes that help the agent interpret the raw audio payload.
Expand All @@ -294,10 +293,15 @@ class PreConnectAudioBuffer {
if (agents.isEmpty) return;

final sampleRate = _renderedSampleRate;
final rawChannels = _renderedChannels ?? 1;
final channels = rawChannels > 0 ? rawChannels : 1;
if (sampleRate == null) {
logger.severe('[Preconnect audio] renderedSampleRate is null');
return;
}
if (rawChannels <= 0) {
logger.warning('[Preconnect audio] Invalid rendered channels: $rawChannels. Falling back to mono.');
}

// Wait for local track published event
final localTrackPublishedEvent = await _localTrackPublishedEvent;
Expand All @@ -320,7 +324,7 @@ class PreConnectAudioBuffer {
topic: topic,
attributes: {
'sampleRate': sampleRate.toString(),
'channels': '1',
'channels': channels.toString(),
'trackId': localTrackSid,
},
totalSize: data.length,
Expand All @@ -334,9 +338,10 @@ class PreConnectAudioBuffer {
await writer.close();

// Compute seconds of audio data sent
final int bytesPerSample = 2; // Assuming 16-bit audio
final int totalSamples = data.length ~/ bytesPerSample;
final double secondsOfAudio = totalSamples / sampleRate;
final int bytesPerSample = 2; // 16-bit audio
final int bytesPerFrame = bytesPerSample * channels;
final int totalFrames = data.length ~/ bytesPerFrame;
final double secondsOfAudio = totalFrames / sampleRate;

logger.info(
'[Preconnect audio] sent ${(data.length / 1024).toStringAsFixed(1)}KB of audio (${secondsOfAudio.toStringAsFixed(2)} seconds) to ${agents} agent(s)');
Expand Down
Loading