Question

I have implemented a hello-world libavcodec program using JNA that generates a WAV file containing a pure 440 Hz sine wave. But when I actually run the program, the WAV file contains annoying clicks and blips (compare it to the clean sine WAV produced by the C program below). How am I calling avcodec_encode_audio2 wrong?

Here is my Java code. All the sources are also on GitHub in case you want to try compiling it.

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.IntBuffer;
import java.util.Objects;

import javax.sound.sampled.AudioFormat;
import javax.sound.sampled.AudioSystem;
import javax.sound.sampled.DataLine;
import javax.sound.sampled.LineUnavailableException;
import javax.sound.sampled.TargetDataLine;


public class Sin {
    /**
     * Abstract class that allows you to put the initialization and cleanup
     * code at the same place instead of separated by the big try block.
     */
    public static abstract class SharedPtr<T> implements AutoCloseable {
        public T ptr;
        public SharedPtr(T ptr) {
            this.ptr = ptr;
        }
        /**
         * Abstract override forces method to throw no checked exceptions.
         * Subclasses will call a C function that throws no exceptions.
         */
        @Override public abstract void close();
    }

    /**
     * @param args
     * @throws IOException 
     * @throws LineUnavailableException 
     */
    public static void main(String[] args) throws IOException, LineUnavailableException {
        final AvcodecLibrary avcodec = AvcodecLibrary.INSTANCE;
        final AvformatLibrary avformat = AvformatLibrary.INSTANCE;
        final AvutilLibrary avutil = AvutilLibrary.INSTANCE;
        avcodec.avcodec_register_all();
        avformat.av_register_all();
        AVOutputFormat.ByReference format = null;
        String format_name = "wav", file_url = "file:sinjava.wav";
        for (AVOutputFormat.ByReference formatIter = avformat.av_oformat_next(null); formatIter != null; formatIter = avformat.av_oformat_next(formatIter)) {
            formatIter.setAutoWrite(false);
            String iterName = formatIter.name;
            if (format_name.equals(iterName)) {
                format = formatIter;
                break;
            }
        }
        Objects.requireNonNull(format);
        System.out.format("Found format %s%n", format_name);
        AVCodec codec = avcodec.avcodec_find_encoder(format.audio_codec);  // one of AvcodecLibrary.CodecID
        Objects.requireNonNull(codec);
        codec.setAutoWrite(false);
        try (
            SharedPtr<AVFormatContext> fmtCtxPtr = new SharedPtr<AVFormatContext>(avformat.avformat_alloc_context()) {@Override public void close(){if (null!=ptr) avformat.avformat_free_context(ptr);}};
            ) {
            AVFormatContext fmtCtx = Objects.requireNonNull(fmtCtxPtr.ptr);
            fmtCtx.setAutoWrite(false);
            fmtCtx.setAutoRead(false);
            fmtCtx.oformat = format; fmtCtx.writeField("oformat");

            AVStream st = avformat.avformat_new_stream(fmtCtx, codec);
            if (null == st)
                throw new IllegalStateException();
            AVCodecContext c = st.codec;
            if (null == c)
                throw new IllegalStateException();
            st.setAutoWrite(false);
            fmtCtx.readField("nb_streams");
            st.id = fmtCtx.nb_streams - 1; st.writeField("id");
            assert st.id >= 0;
            System.out.format("New stream: id=%d%n", st.id);

            if (0 != (format.flags & AvformatLibrary.AVFMT_GLOBALHEADER)) {
                c.flags |= AvcodecLibrary.CODEC_FLAG_GLOBAL_HEADER;
            }
            c.writeField("flags");

            c.bit_rate = 64000; c.writeField("bit_rate");
            int bestSampleRate;
            if (null == codec.supported_samplerates) {
                bestSampleRate = 44100;
            } else {
                bestSampleRate = 0;
                for (int offset = 0, sample_rate = codec.supported_samplerates.getInt(offset); sample_rate != 0; sample_rate = codec.supported_samplerates.getInt(offset += 4)) {  // Pointer.getInt takes a byte offset
                    bestSampleRate = Math.max(bestSampleRate, sample_rate);
                }
                assert bestSampleRate > 0;
            }
            c.sample_rate = bestSampleRate; c.writeField("sample_rate");
            c.channel_layout = AvutilLibrary.AV_CH_LAYOUT_STEREO; c.writeField("channel_layout");
            c.channels = avutil.av_get_channel_layout_nb_channels(c.channel_layout); c.writeField("channels");
            assert 2 == c.channels;
            c.sample_fmt = AvutilLibrary.AVSampleFormat.AV_SAMPLE_FMT_S16; c.writeField("sample_fmt");
            c.time_base.num = 1;
            c.time_base.den = bestSampleRate;
            c.writeField("time_base");
            c.setAutoWrite(false);

            AudioFormat javaSoundFormat = new AudioFormat(bestSampleRate, Short.SIZE, c.channels, true, ByteOrder.nativeOrder() == ByteOrder.BIG_ENDIAN);
            DataLine.Info javaDataLineInfo = new DataLine.Info(TargetDataLine.class, javaSoundFormat);
            if (! AudioSystem.isLineSupported(javaDataLineInfo))
                throw new IllegalStateException();
            int err;
            if ((err = avcodec.avcodec_open(c, codec)) < 0) {
                throw new IllegalStateException();
            }
            assert c.channels != 0;

            AVIOContext.ByReference[] ioCtxReference = new AVIOContext.ByReference[1];
            if (0 != (err = avformat.avio_open(ioCtxReference, file_url, AvformatLibrary.AVIO_FLAG_WRITE))) {
                throw new IllegalStateException("averror " + err);
            }
            try (
                SharedPtr<AVIOContext.ByReference> ioCtxPtr = new SharedPtr<AVIOContext.ByReference>(ioCtxReference[0]) {@Override public void close(){if (null!=ptr) avutil.av_free(ptr.getPointer());}}
                ) {
                AVIOContext.ByReference ioCtx = Objects.requireNonNull(ioCtxPtr.ptr);
                fmtCtx.pb = ioCtx; fmtCtx.writeField("pb");
                int averr = avformat.avformat_write_header(fmtCtx, null);
                if (averr < 0) {
                    throw new IllegalStateException("" + averr);
                }
                st.read();  // it is modified by avformat_write_header
                System.out.format("Wrote header. fmtCtx->nb_streams=%d, st->time_base=%d/%d; st->avg_frame_rate=%d/%d%n", fmtCtx.nb_streams, st.time_base.num, st.time_base.den, st.avg_frame_rate.num, st.avg_frame_rate.den); 
                avformat.avio_flush(ioCtx);
                int frame_size = c.frame_size != 0 ? c.frame_size : 4096;
                int expectedBufferSize = frame_size * c.channels * (Short.SIZE/8);
                boolean supports_small_last_frame = c.frame_size == 0 ? true : 0 != (codec.capabilities & AvcodecLibrary.CODEC_CAP_SMALL_LAST_FRAME);
                int bufferSize = avutil.av_samples_get_buffer_size((IntBuffer)null, c.channels, frame_size, c.sample_fmt, 1);
                assert bufferSize == expectedBufferSize: String.format("expected %d; got %d", expectedBufferSize, bufferSize);
                ByteBuffer samples = ByteBuffer.allocate(expectedBufferSize);
                samples.order(ByteOrder.nativeOrder());
                int audio_time = 0;  // unit: (c.time_base) s = (1/c.sample_rate) s
                int audio_sample_count = supports_small_last_frame ?
                    3 * c.sample_rate :
                    3 * c.sample_rate / frame_size * frame_size;
                while (audio_time < audio_sample_count) {
                    int frame_audio_time = audio_time;
                    samples.clear();
                    int nb_samples_in_frame = 0;
                    // encode a single tone sound
                    for (; samples.hasRemaining() && audio_time < audio_sample_count; nb_samples_in_frame++, audio_time++) {
                        double x = 2*Math.PI*440/c.sample_rate * audio_time;
                        double y = 10000 * Math.sin(x);
                        samples.putShort((short) y);
                        samples.putShort((short) y);
                    }
                    samples.flip();
                    try (
                            SharedPtr<AVFrame> framePtr = new SharedPtr<AVFrame>(avcodec.avcodec_alloc_frame()) {@Override public void close() {if (null!=ptr) avutil.av_free(ptr.getPointer());}};
                            ) {
                        AVFrame frame = Objects.requireNonNull(framePtr.ptr);
                        frame.setAutoRead(false);  // will be an in param
                        frame.setAutoWrite(false);
                        frame.nb_samples = nb_samples_in_frame; frame.writeField("nb_samples"); // actually unused during encoding
                        // Presentation time, in AVStream.time_base units.
                        frame.pts = avutil.av_rescale_q(frame_audio_time, c.time_base, st.time_base);  // i * codec_time_base / st_time_base
                        frame.writeField("pts");

                        assert c.channels > 0;
                        int bytesPerSample = avutil.av_get_bytes_per_sample(c.sample_fmt);
                        assert bytesPerSample > 0;
                        if (0 != (err = avcodec.avcodec_fill_audio_frame(frame, c.channels, c.sample_fmt, samples, samples.capacity(), 1))) {
                            throw new IllegalStateException(""+err);
                        }
                        AVPacket packet = new AVPacket();  // one of the few structs from ffmpeg with guaranteed size
                        avcodec.av_init_packet(packet);
                        packet.size = 0;
                        packet.data = null;
                        packet.stream_index = st.index; packet.writeField("stream_index");
                        // encode the samples
                        IntBuffer gotPacket = IntBuffer.allocate(1);
                        if (0 != (err = avcodec.avcodec_encode_audio2(c, packet, frame, gotPacket))) {
                            throw new IllegalStateException("" + err);
                        } else if (0 != gotPacket.get()) {
                            packet.read();
                            averr = avformat.av_write_frame(fmtCtx, packet);
                            if (averr < 0)
                                throw new IllegalStateException("" + averr);
                        }
                        System.out.format("encoded frame: codec time = %d; pts=%d = av_rescale_q(%d,%d/%d,%d/%d) (%.02fs) contains %d samples (%.02fs); got_packet=%d; packet.size=%d%n",
                                frame_audio_time,
                                frame.pts,
                                frame_audio_time, st.codec.time_base.num,st.codec.time_base.den,st.time_base.num,st.time_base.den,
                                1.*frame_audio_time/c.sample_rate, frame.nb_samples, 1.*frame.nb_samples/c.sample_rate, gotPacket.array()[0], packet.size);
                    }
                }
                if (0 != (err = avformat.av_write_trailer(fmtCtx))) {
                    throw new IllegalStateException();
                }
                avformat.avio_flush(ioCtx);
            }
        }
        System.out.println("Done writing");
    }
}

I also rewrote it in C, and the C version works fine without any blips. But I can’t figure out how I am using the library differently; all the library function calls should be identical!

//! gcc --std=c99 sin.c $(pkg-config --cflags --libs libavutil libavformat libavcodec) -o sin
// sudo apt-get install libswscale-dev
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <math.h>

#include <libavutil/opt.h>
#include <libavutil/mathematics.h>
#include <libavformat/avformat.h>
#include <libswscale/swscale.h>
#include <libavcodec/avcodec.h>
int main(int argc, char *argv[]) {
  const char *format_name = "wav", *file_url = "file:sin.wav";
  avcodec_register_all();
  av_register_all();
  AVOutputFormat *format = NULL;
  for (AVOutputFormat *formatIter = av_oformat_next(NULL); formatIter != NULL; formatIter = av_oformat_next(formatIter)) {
    int hasEncoder = NULL != avcodec_find_encoder(formatIter->audio_codec);
    if (0 == strcmp(format_name, formatIter->name)) {
      format = formatIter;
      break;
    }
  }
  printf("Found format %s\n", format->name);
  AVCodec *codec = avcodec_find_encoder(format->audio_codec);
  if (! codec) {
    fprintf(stderr, "Could not find codec %d\n", format->audio_codec);
    exit(1);
  }
  AVFormatContext *fmtCtx = avformat_alloc_context();
  if (! fmtCtx) {
    fprintf(stderr, "error allocating AVFormatContext\n");
    exit(1);
  }
  fmtCtx->oformat = format;
  AVStream *st = avformat_new_stream(fmtCtx, codec);
  if (! st) {
    fprintf(stderr, "error allocating AVStream\n");
    exit(1);
  }
  if (fmtCtx->nb_streams != 1) {
    fprintf(stderr, "avformat_new_stream should have incremented nb_streams, but it's still %d\n", fmtCtx->nb_streams);
    exit(1);
  }
  AVCodecContext *c = st->codec;
  if (! c) {
    fprintf(stderr, "avformat_new_stream should have allocated a AVCodecContext for my stream\n");
    exit(1);
  }
  st->id = fmtCtx->nb_streams - 1;
  printf("Created stream %d\n", st->id);
  if (0 != (format->flags & AVFMT_GLOBALHEADER)) {
    c->flags |= CODEC_FLAG_GLOBAL_HEADER;
  }
  c->bit_rate = 64000;
  int bestSampleRate;
  if (NULL == codec->supported_samplerates) {
    bestSampleRate = 44100;
    printf("Setting sample rate: %d\n", bestSampleRate);
  } else {
    bestSampleRate = 0;
    for (const int *sample_rate_iter = codec->supported_samplerates; *sample_rate_iter != 0; sample_rate_iter++) {
      if (*sample_rate_iter >= bestSampleRate)
        bestSampleRate = *sample_rate_iter;
    }
    printf("Using best supported sample rate: %d\n", bestSampleRate);
  }
  c->sample_rate = bestSampleRate;
  c->channel_layout = AV_CH_LAYOUT_STEREO;
  c->channels = av_get_channel_layout_nb_channels(c->channel_layout);
  c->time_base.num = 1;
  c->time_base.den = c->sample_rate;
  if (c->channels != 2) {
    fprintf(stderr, "av_get_channel_layout_nb_channels returned %d instead of 2\n", c->channels);
    exit(1);
  }
  c->sample_fmt = AV_SAMPLE_FMT_S16;
  int averr;
  if ((averr = avcodec_open2(c, codec, NULL)) < 0) {
    fprintf(stderr, "avcodec_open2 returned error %d\n", averr);
    exit(1);
  }
  AVIOContext *ioCtx = NULL;
  if (0 != (averr = avio_open(&ioCtx, file_url, AVIO_FLAG_WRITE))) {
    fprintf(stderr, "avio_open returned error %d\n", averr);
    exit(1);
  }
  if (ioCtx == NULL) {
    fprintf(stderr, "AVIOContext should have been set by avio_open\n");
    exit(1);
  }
  fmtCtx->pb = ioCtx;
  if (0 != (averr = avformat_write_header(fmtCtx, NULL))) {
    fprintf(stderr, "avformat_write_header returned error %d\n", averr);
    exit(1);
  }
  printf("Wrote header. fmtCtx->nb_streams=%d, st->time_base=%d/%d; st->avg_frame_rate=%d/%d\n", fmtCtx->nb_streams, st->time_base.num, st->time_base.den, st->avg_frame_rate.num, st->avg_frame_rate.den);
  int align = 1;
  int sample_size = av_get_bytes_per_sample(c->sample_fmt);
  if (sample_size != sizeof(int16_t)) {
    fprintf(stderr, "expected sample size=%zu but got %d\n", sizeof(int16_t), sample_size);
    exit(1);
  }
  int frame_size = c->frame_size != 0 ? c->frame_size : 4096;
  int bufferSize = av_samples_get_buffer_size(NULL, c->channels, frame_size, c->sample_fmt, align);
  int expectedBufferSize = frame_size * c->channels * sample_size;
  int supports_small_last_frame = c->frame_size == 0 ? 1 : 0 != (codec->capabilities & CODEC_CAP_SMALL_LAST_FRAME);
  if (bufferSize != expectedBufferSize) {
    fprintf(stderr, "expected buffer size=%d but got %d\n", expectedBufferSize, bufferSize);
    exit(1);
  }
  int16_t *samples = (int16_t*)malloc(bufferSize);

  uint32_t audio_time = 0;  // unit: (1/c->sample_rate) s
  uint32_t audio_sample_count = supports_small_last_frame ?
    3 * c->sample_rate :
    3 * c->sample_rate / frame_size * frame_size;
  while (audio_time < audio_sample_count) {
    uint32_t frame_audio_time = audio_time; // unit: (1/c->sample_rate) s
    AVFrame *frame = avcodec_alloc_frame();
    if (frame == NULL) {
      fprintf(stderr, "avcodec_alloc_frame failed\n");
      exit(1);
    }
    for (uint32_t i = 0; i != frame_size && audio_time < audio_sample_count; i++, audio_time++) {
      samples[2*i] = samples[2*i + 1] = 10000 * sin(2*M_PI*440/c->sample_rate * audio_time);
      frame->nb_samples = i+1;  // actually unused during encoding
    }
    // frame->format = c->sample_fmt;  // unused during encoding
    frame->pts = av_rescale_q(frame_audio_time, c->time_base, st->time_base);
    if (0 != (averr = avcodec_fill_audio_frame(frame, c->channels, c->sample_fmt, (const uint8_t*)samples, bufferSize, align))) {
      fprintf(stderr, "avcodec_fill_audio_frame returned error %d\n", averr);
      exit(1);
    }
    AVPacket packet;
    av_init_packet(&packet);
    packet.data = NULL;
    packet.size = 0;
    int got_packet;
    if (0 != (averr = avcodec_encode_audio2(c, &packet, frame, &got_packet))) {
      fprintf(stderr, "avcodec_encode_audio2 returned error %d\n", averr);
      exit(1);
    }
    if (got_packet) {
        packet.stream_index = st->index;
      if (0 < (averr = av_write_frame(fmtCtx, &packet))) {
        fprintf(stderr, "av_write_frame returned error %d\n", averr);
        exit(1);
      } else if (averr == 1) {
        // end of stream wanted.
      }
    }
    printf("encoded frame: codec time = %u; format pts=%ld = av_rescale_q(%u,%d/%d,%d/%d) (%.02fs) contains %d samples (%.02fs); got_packet=%d; packet.size=%d\n",
        frame_audio_time,
        frame->pts,
        frame_audio_time, c->time_base.num, c->time_base.den, st->time_base.num, st->time_base.den,
        1.*frame_audio_time/c->sample_rate, frame->nb_samples, 1.*frame->nb_samples/c->sample_rate, got_packet, packet.size);
    av_free(frame);
  }
  free(samples);
  cleanupFile:
  if (0 != (averr = av_write_trailer(fmtCtx))) {
    fprintf(stderr, "av_write_trailer returned error %d\n", averr);
    exit(1);
  }

  avio_flush(ioCtx);
  avio_close(ioCtx);
  avformat_free_context(fmtCtx);
}

Solution

The problem was that ByteBuffer.allocate(int) creates a heap buffer whose native address is not stable across JNA function calls: every time you call a native function, JNA copies the buffer's bytes into a temporary native allocation that exists only for that invocation. By contrast, ByteBuffer.allocateDirect(int) creates a buffer backed by a single native allocation whose address is stable. This is apparently a well-known pitfall of using ByteBuffer with JNA, but I didn't notice it in the fine print of the JNA documentation on Using Pointers and Arrays.
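
To see the difference directly, here is a minimal sketch of my own (not part of the original program) using JNA's Native.getDirectBufferPointer, which exposes the native address backing a direct buffer:

import com.sun.jna.Native;
import com.sun.jna.Pointer;

import java.nio.ByteBuffer;

public class DirectBufferDemo {
    public static void main(String[] args) {
        // A direct buffer is backed by one native allocation for its whole
        // lifetime, so every native call sees the same address.
        ByteBuffer direct = ByteBuffer.allocateDirect(16);
        Pointer p1 = Native.getDirectBufferPointer(direct);
        Pointer p2 = Native.getDirectBufferPointer(direct);
        System.out.println(p1.equals(p2));  // true: the address is stable

        // ByteBuffer.allocate(16) would instead give a heap buffer with no
        // native address of its own; JNA copies such buffers into temporary
        // native memory separately for each call, which is exactly what
        // corrupted the samples buffer here.
    }
}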

So the fix was a single line: create the buffer with ByteBuffer samples = ByteBuffer.allocateDirect(expectedBufferSize); instead. The subsequent avcodec_fill_audio_frame call does not copy the samples; it simply points frame->data[0] at the buffer's uint8_t* address, so the sample buffer needs a stable native address. In context, the change looks like this:
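
// was: ByteBuffer samples = ByteBuffer.allocate(expectedBufferSize);
// JNA copies a heap buffer into a fresh temporary native allocation on
// every call, so the pointer that avcodec_fill_audio_frame stores into
// frame->data[0] is already dangling when avcodec_encode_audio2 reads it.
ByteBuffer samples = ByteBuffer.allocateDirect(expectedBufferSize);  // stable native address
samples.order(ByteOrder.nativeOrder());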

OTHER TIPS

Without having done what you are doing, I suspect the garbage collector.

See "How can I disable Java garbage collector?": the answer is that you can't, so increase the available memory instead.
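
For example, heap size can be raised with the standard HotSpot flags (the sizes below are arbitrary placeholders):

java -Xms256m -Xmx2g Sin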

Licensed under: CC-BY-SA with attribution
Not affiliated with StackOverflow