Transcode of H.264 to VP8 using libav* has incorrect frame rate

https://stackoverflow.com/questions/23098080

04-07-2023
|

Question

I’ve so far failed to get the correct output frame rate when transcoding H.264 to VP8 with the libav* libraries. I created a functioning encode of Sintel.2010.720p.mkv as WebM (VP8/Vorbis) using a modification of the transcoding.c example in the FFmpeg source. Unfortunately the resulting file is 48 fps unlike the 24 fps of the original and the output of the ffmpeg command I’m trying to mimic.

I noticed ffprobe produces a tbc of double the fps for this and other H.264 videos, while the tbc of the resulting VP8 stream produced by the ffmpeg command is the default 1000. The stock transcoding.c example copies the time base of the decoder to the encoder AVCodecContext, which is 1/48. Running the ffmpeg command through gdb it looks like the time base of the AVCodecContext is set to 1/24, but making that change alone only causes the resulting video to be slowed to twice the duration at 24 fps.

I can create a usable video, but the frame rate doubles. When the output frame rate is the correct 24 fps, the video is smooth but slowed to half speed.

Here is my modification of the example.

 /*
  * Copyright (c) 2010 Nicolas George
  * Copyright (c) 2011 Stefano Sabatini
  * Copyright (c) 2014 Andrey Utkin
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
  * in the Software without restriction, including without limitation the rights
  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  * copies of the Software, and to permit persons to whom the Software is
  * furnished to do so, subject to the following conditions:
  *
  * The above copyright notice and this permission notice shall be included in
  * all copies or substantial portions of the Software.
  *
  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */

 /**
  * @file
  * API example for demuxing, decoding, filtering, encoding and muxing
  * @example doc/examples/transcoding.c
  */

 #include <libavcodec/avcodec.h>
 #include <libavformat/avformat.h>
 #include <libavfilter/avfiltergraph.h>
 #include <libavfilter/avcodec.h>
 #include <libavfilter/buffersink.h>
 #include <libavfilter/buffersrc.h>
 #include <libavutil/opt.h>
 #include <libavutil/pixdesc.h>

 #define STATS_LOG "stats.log"

 /* Demuxer context for the input file; assigned by open_input_file(). */
 static AVFormatContext *ifmt_ctx;
 /* Muxer context for the output file; assigned by init_output_context(). */
 static AVFormatContext *ofmt_ctx;
 /* Per-stream filter graph together with its two endpoints. */
 typedef struct FilteringContext {
   /* Sink endpoint: filtered frames are pulled from here. */
   AVFilterContext *buffersink_ctx;
   /* Source endpoint: decoded frames are pushed in here. */
   AVFilterContext *buffersrc_ctx;
   /* Graph owning both endpoints; NULL for streams that are not filtered. */
   AVFilterGraph *filter_graph;
 } FilteringContext;
 /* Array with one entry per input stream; allocated by init_filters(). */
 static FilteringContext *filter_ctx;

 /**
  * Open the input container and a decoder for each audio/video stream.
  *
  * The demuxer context is stored in the global ifmt_ctx. Streams of any
  * other type are left without an opened decoder.
  *
  * @param filename input path/URL passed to avformat_open_input()
  * @return 0 on success, a negative AVERROR code otherwise
  */
 static int open_input_file(const char *filename) {
   unsigned int idx;
   int err;

   ifmt_ctx = NULL;
   err = avformat_open_input(&ifmt_ctx, filename, NULL, NULL);
   if (err < 0) {
     av_log(NULL, AV_LOG_ERROR, "Cannot open input file\n");
     return err;
   }

   err = avformat_find_stream_info(ifmt_ctx, NULL);
   if (err < 0) {
     av_log(NULL, AV_LOG_ERROR, "Cannot find stream information\n");
     return err;
   }

   for (idx = 0; idx < ifmt_ctx->nb_streams; idx++) {
     AVCodecContext *dec = ifmt_ctx->streams[idx]->codec;

     /* Only video and audio get re-encoded, so only they need a decoder. */
     if (dec->codec_type != AVMEDIA_TYPE_VIDEO
         && dec->codec_type != AVMEDIA_TYPE_AUDIO)
       continue;

     err = avcodec_open2(dec, avcodec_find_decoder(dec->codec_id), NULL);
     if (err < 0) {
       av_log(NULL, AV_LOG_ERROR, "Failed to open decoder for stream #%u\n", idx);
       return err;
     }
   }

   av_dump_format(ifmt_ctx, 0, filename, 0);
   return 0;
 }

 /**
  * Allocate the output (muxer) context and store it in the global ofmt_ctx.
  *
  * The container format is guessed from the file name.
  *
  * @param filename output path used for format guessing
  * @return 0 on success; the error reported by
  *         avformat_alloc_output_context2(), or AVERROR_UNKNOWN when no
  *         context was produced without an error code
  */
 static int init_output_context(char* filename) {
   int ret;

   ofmt_ctx = NULL;
   ret = avformat_alloc_output_context2(&ofmt_ctx, NULL, NULL, filename);
   if (ret < 0 || !ofmt_ctx) {
     av_log(NULL, AV_LOG_ERROR, "Could not create output context\n");
     return ret < 0 ? ret : AVERROR_UNKNOWN;
   }

   return 0;
 }

 static int init_webm_encoders(int audioBitRate, int crf, int videoMaxBitRate, int threads,
               char* quality, int speed, int pass, char* stats) {
   AVStream *out_stream;
   AVStream *in_stream;
   AVCodecContext *dec_ctx, *enc_ctx;
   AVCodec *encoder;
   int ret;
   unsigned int i;

   for (i = 0; i < ifmt_ctx->nb_streams; i++) {
 in_stream = ifmt_ctx->streams[i];
 dec_ctx = in_stream->codec;
 if (dec_ctx->codec_type == AVMEDIA_TYPE_VIDEO || dec_ctx->codec_type == AVMEDIA_TYPE_AUDIO) {

   AVDictionary *opts = NULL;
   if (dec_ctx->codec_type == AVMEDIA_TYPE_VIDEO) {
     encoder = avcodec_find_encoder(AV_CODEC_ID_VP8);
     out_stream = avformat_new_stream(ofmt_ctx, encoder);
     if (!out_stream) {
       av_log(NULL, AV_LOG_ERROR, "Failed allocating output stream\n");
       return AVERROR_UNKNOWN;
     }

     enc_ctx = out_stream->codec;
     enc_ctx->height = dec_ctx->height;
     enc_ctx->width = dec_ctx->width;
     enc_ctx->sample_aspect_ratio = dec_ctx->sample_aspect_ratio;
     /* take first format from list of supported formats */
     enc_ctx->pix_fmt = encoder->pix_fmts[0];
     /* video time_base can be set to whatever is handy and supported by encoder */
     enc_ctx->time_base = dec_ctx->time_base;
     /* enc_ctx->time_base.num = 1; */
     /* enc_ctx->time_base.den = 24; */

     enc_ctx->bit_rate = videoMaxBitRate;
     enc_ctx->thread_count = threads;
     switch (pass) {
     case 1:
       enc_ctx->flags |= CODEC_FLAG_PASS1;
       break;
     case 2:
       enc_ctx->flags |= CODEC_FLAG_PASS2;
       if (stats) {
     enc_ctx->stats_in = stats;
       }
       break;
     }

     char crfString[3];
     snprintf(crfString, 3, "%d", crf);
     av_dict_set(&opts, "crf", crfString, 0);
     av_dict_set(&opts, "quality", quality, 0);
     char speedString[3];
     snprintf(speedString, 3, "%d", speed);
     av_dict_set(&opts, "speed", speedString, 0);
   } else {
     encoder = avcodec_find_encoder(AV_CODEC_ID_VORBIS);
     out_stream = avformat_new_stream(ofmt_ctx, encoder);
     if (!out_stream) {
       av_log(NULL, AV_LOG_ERROR, "Failed allocating output stream\n");
       return AVERROR_UNKNOWN;
     }

     /* in_stream = ifmt_ctx->streams[i]; */
     /* dec_ctx = in_stream->codec; */
     enc_ctx = out_stream->codec;
     /* encoder = out_stream->codec->codec; */

     enc_ctx->sample_rate = dec_ctx->sample_rate;
     enc_ctx->channel_layout = dec_ctx->channel_layout;
     enc_ctx->channels = av_get_channel_layout_nb_channels(enc_ctx->channel_layout);
     /* take first format from list of supported formats */
     enc_ctx->sample_fmt = encoder->sample_fmts[0];
     enc_ctx->time_base = (AVRational){1, enc_ctx->sample_rate};
     enc_ctx->bit_rate = audioBitRate;
   }

   /* Open codec with the set options */
   ret = avcodec_open2(enc_ctx, encoder, &opts);
   if (ret < 0) {
     av_log(NULL, AV_LOG_ERROR, "Cannot open video encoder for stream #%u\n", i);
     return ret;
   }
   int unused = av_dict_count(opts);
   if (unused > 0) {
     av_log(NULL, AV_LOG_WARNING, "%d unused options\n", unused);
   }
   /* } else if (dec_ctx->codec_type == AVMEDIA_TYPE_UNKNOWN) { */
 } else {
   av_log(NULL, AV_LOG_FATAL, "Elementary stream #%d is of unknown type, cannot proceed\n", i);
   return AVERROR_INVALIDDATA;
 } /* else { */
   /*   /\* if this stream must be remuxed *\/ */
   /*   ret = avcodec_copy_context(ofmt_ctx->streams[i]->codec, */
   /*                ifmt_ctx->streams[i]->codec); */
   /*   if (ret < 0) { */
   /*   av_log(NULL, AV_LOG_ERROR, "Copying stream context failed\n"); */
   /*   return ret; */
   /*   } */
   /* } */

 if (ofmt_ctx->oformat->flags & AVFMT_GLOBALHEADER)
   enc_ctx->flags |= CODEC_FLAG_GLOBAL_HEADER;
   }

   return 0;
 }

 /**
  * Open the output file (when the muxer needs one) and write its header.
  *
  * Operates on the global ofmt_ctx, whose streams must already be set up.
  *
  * @param filename output path, also used for the format dump
  * @return 0 on success, a negative AVERROR code otherwise
  */
 static int open_output_file(const char *filename) {
   int err;

   av_dump_format(ofmt_ctx, 0, filename, 1);

   /* Muxers flagged AVFMT_NOFILE do their own I/O. */
   if ((ofmt_ctx->oformat->flags & AVFMT_NOFILE) == 0) {
     err = avio_open(&ofmt_ctx->pb, filename, AVIO_FLAG_WRITE);
     if (err < 0) {
       av_log(NULL, AV_LOG_ERROR, "Could not open output file '%s'", filename);
       return err;
     }
   }

   /* init muxer, write output file header */
   err = avformat_write_header(ofmt_ctx, NULL);
   if (err >= 0)
     return 0;

   av_log(NULL, AV_LOG_ERROR, "Error occurred when opening output file\n");
   return err;
 }

 /**
  * Build and configure the filter graph for one stream.
  *
  * A buffer source described by the decoder parameters and a buffer sink
  * constrained to the encoder's format are created, then linked through
  * the chain described by filter_spec.
  *
  * @param fctx        filled with the graph and its endpoints on success;
  *                    left untouched on failure
  * @param dec_ctx     decoder context describing the frames fed in
  * @param enc_ctx     encoder context whose format the sink must deliver
  * @param filter_spec textual filter chain placed between source and sink
  * @return a non-negative value on success, a negative AVERROR code
  *         otherwise (the partially built graph is freed in that case)
  */
 static int init_filter(FilteringContext* fctx, AVCodecContext *dec_ctx,
            AVCodecContext *enc_ctx, const char *filter_spec) {
   char args[512];
   int ret = 0;
   AVFilter *buffersrc = NULL;
   AVFilter *buffersink = NULL;
   AVFilterContext *buffersrc_ctx = NULL;
   AVFilterContext *buffersink_ctx = NULL;
   AVFilterInOut *outputs = avfilter_inout_alloc();
   AVFilterInOut *inputs  = avfilter_inout_alloc();
   AVFilterGraph *filter_graph = avfilter_graph_alloc();

   if (!outputs || !inputs || !filter_graph) {
     ret = AVERROR(ENOMEM);
     goto end;
   }

   if (dec_ctx->codec_type == AVMEDIA_TYPE_VIDEO) {
     buffersrc = avfilter_get_by_name("buffer");
     buffersink = avfilter_get_by_name("buffersink");
     if (!buffersrc || !buffersink) {
       av_log(NULL, AV_LOG_ERROR, "filtering source or sink element not found\n");
       ret = AVERROR_UNKNOWN;
       goto end;
     }

     snprintf(args, sizeof(args),
          "video_size=%dx%d:pix_fmt=%d:time_base=%d/%d:pixel_aspect=%d/%d",
          dec_ctx->width, dec_ctx->height, dec_ctx->pix_fmt,
          dec_ctx->time_base.num, dec_ctx->time_base.den,
          dec_ctx->sample_aspect_ratio.num,
          dec_ctx->sample_aspect_ratio.den);

     ret = avfilter_graph_create_filter(&buffersrc_ctx, buffersrc, "in",
                        args, NULL, filter_graph);
     if (ret < 0) {
       av_log(NULL, AV_LOG_ERROR, "Cannot create buffer source\n");
       goto end;
     }

     ret = avfilter_graph_create_filter(&buffersink_ctx, buffersink, "out",
                        NULL, NULL, filter_graph);
     if (ret < 0) {
       av_log(NULL, AV_LOG_ERROR, "Cannot create buffer sink\n");
       goto end;
     }

     ret = av_opt_set_bin(buffersink_ctx, "pix_fmts",
                  (uint8_t*)&enc_ctx->pix_fmt, sizeof(enc_ctx->pix_fmt),
                  AV_OPT_SEARCH_CHILDREN);
     if (ret < 0) {
       av_log(NULL, AV_LOG_ERROR, "Cannot set output pixel format\n");
       goto end;
     }
   } else if (dec_ctx->codec_type == AVMEDIA_TYPE_AUDIO) {
     buffersrc = avfilter_get_by_name("abuffer");
     buffersink = avfilter_get_by_name("abuffersink");
     if (!buffersrc || !buffersink) {
       av_log(NULL, AV_LOG_ERROR, "filtering source or sink element not found\n");
       ret = AVERROR_UNKNOWN;
       goto end;
     }

     /* The abuffer arguments need a layout; derive one if none is set. */
     if (!dec_ctx->channel_layout)
       dec_ctx->channel_layout =
         av_get_default_channel_layout(dec_ctx->channels);
     snprintf(args, sizeof(args),
          "time_base=%d/%d:sample_rate=%d:sample_fmt=%s:channel_layout=0x%"PRIx64,
          dec_ctx->time_base.num, dec_ctx->time_base.den, dec_ctx->sample_rate,
          av_get_sample_fmt_name(dec_ctx->sample_fmt),
          dec_ctx->channel_layout);
     ret = avfilter_graph_create_filter(&buffersrc_ctx, buffersrc, "in",
                        args, NULL, filter_graph);
     if (ret < 0) {
       av_log(NULL, AV_LOG_ERROR, "Cannot create audio buffer source\n");
       goto end;
     }

     ret = avfilter_graph_create_filter(&buffersink_ctx, buffersink, "out",
                        NULL, NULL, filter_graph);
     if (ret < 0) {
       av_log(NULL, AV_LOG_ERROR, "Cannot create audio buffer sink\n");
       goto end;
     }

     ret = av_opt_set_bin(buffersink_ctx, "sample_fmts",
                  (uint8_t*)&enc_ctx->sample_fmt, sizeof(enc_ctx->sample_fmt),
                  AV_OPT_SEARCH_CHILDREN);
     if (ret < 0) {
       av_log(NULL, AV_LOG_ERROR, "Cannot set output sample format\n");
       goto end;
     }

     ret = av_opt_set_bin(buffersink_ctx, "channel_layouts",
                  (uint8_t*)&enc_ctx->channel_layout,
                  sizeof(enc_ctx->channel_layout), AV_OPT_SEARCH_CHILDREN);
     if (ret < 0) {
       av_log(NULL, AV_LOG_ERROR, "Cannot set output channel layout\n");
       goto end;
     }

     ret = av_opt_set_bin(buffersink_ctx, "sample_rates",
                  (uint8_t*)&enc_ctx->sample_rate, sizeof(enc_ctx->sample_rate),
                  AV_OPT_SEARCH_CHILDREN);
     if (ret < 0) {
       av_log(NULL, AV_LOG_ERROR, "Cannot set output sample rate\n");
       goto end;
     }
   } else {
     ret = AVERROR_UNKNOWN;
     goto end;
   }

   /* Endpoints for the filter graph. */
   outputs->name       = av_strdup("in");
   outputs->filter_ctx = buffersrc_ctx;
   outputs->pad_idx    = 0;
   outputs->next       = NULL;

   inputs->name       = av_strdup("out");
   inputs->filter_ctx = buffersink_ctx;
   inputs->pad_idx    = 0;
   inputs->next       = NULL;

   if (!outputs->name || !inputs->name) {
     ret = AVERROR(ENOMEM);
     goto end;
   }

   if ((ret = avfilter_graph_parse_ptr(filter_graph, filter_spec,
                       &inputs, &outputs, NULL)) < 0)
     goto end;

   if ((ret = avfilter_graph_config(filter_graph, NULL)) < 0)
     goto end;

   /* Fill FilteringContext */
   fctx->buffersrc_ctx = buffersrc_ctx;
   fctx->buffersink_ctx = buffersink_ctx;
   fctx->filter_graph = filter_graph;

  end:
   avfilter_inout_free(&inputs);
   avfilter_inout_free(&outputs);
   /* On failure nobody else holds the graph, so release it here. */
   if (ret < 0)
     avfilter_graph_free(&filter_graph);

   return ret;
 }

 static int init_filters(enum AVCodecID audioCodec) {
   const char *filter_spec;
   unsigned int i;
   int ret;
   filter_ctx = av_malloc_array(ifmt_ctx->nb_streams, sizeof(*filter_ctx));
   if (!filter_ctx)
 return AVERROR(ENOMEM);

   for (i = 0; i < ifmt_ctx->nb_streams; i++) {
 filter_ctx[i].buffersrc_ctx  = NULL;
 filter_ctx[i].buffersink_ctx = NULL;
 filter_ctx[i].filter_graph   = NULL;
 /* Skip streams that are neither audio nor video */
 if (!(ifmt_ctx->streams[i]->codec->codec_type == AVMEDIA_TYPE_AUDIO
       || ifmt_ctx->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO))
   continue;


 if (ifmt_ctx->streams[i]->codec->codec_type == AVMEDIA_TYPE_VIDEO)
   filter_spec = "null"; /* passthrough (dummy) filter for video */
 else
   /* TODO: make this more general */
   if (audioCodec == AV_CODEC_ID_VORBIS) {
     filter_spec = "asetnsamples=n=64";
   } else {
     /* filter_spec = "null"; /\* passthrough (dummy) filter for audio *\/ */
     filter_spec = "fps=24";
     /* filter_spec = "settb=expr=1/24"; */
   }
 ret = init_filter(&filter_ctx[i], ifmt_ctx->streams[i]->codec,
           ofmt_ctx->streams[i]->codec, filter_spec);
 if (ret)
   return ret;
   }
   return 0;
 }

 /**
  * Encode one frame and, if the encoder produced a packet, mux it.
  *
  * The frame is always released, whether or not encoding succeeds.
  *
  * @param filt_frame   frame to encode (NULL is passed through to the
  *                     encoder, as done when flushing)
  * @param stream_index index of the stream in both ifmt_ctx and ofmt_ctx
  * @param got_frame    receives whether a packet was produced; may be NULL
  * @return 0 when no packet was produced, otherwise the muxer's result;
  *         a negative AVERROR code on encoding failure
  */
 static int encode_write_frame(AVFrame *filt_frame, unsigned int stream_index, int *got_frame) {
   AVStream *ost = ofmt_ctx->streams[stream_index];
   int (*encode)(AVCodecContext *, AVPacket *, const AVFrame *, int *);
   AVPacket pkt;
   int produced;
   int err;

   /* The media type of the matching input stream picks the encode call. */
   if (ifmt_ctx->streams[stream_index]->codec->codec_type == AVMEDIA_TYPE_VIDEO)
     encode = avcodec_encode_video2;
   else
     encode = avcodec_encode_audio2;

   if (got_frame == NULL)
     got_frame = &produced;

   /* encode filtered frame */
   pkt.data = NULL;
   pkt.size = 0;
   av_init_packet(&pkt);
   err = encode(ost->codec, &pkt, filt_frame, got_frame);
   av_frame_free(&filt_frame);
   if (err < 0)
     return err;
   if (*got_frame == 0)
     return 0;

   /* prepare packet for muxing: codec time base -> stream time base */
   pkt.stream_index = stream_index;
   pkt.dts = av_rescale_q_rnd(pkt.dts, ost->codec->time_base, ost->time_base,
                              AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
   pkt.pts = av_rescale_q_rnd(pkt.pts, ost->codec->time_base, ost->time_base,
                              AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
   pkt.duration = av_rescale_q(pkt.duration, ost->codec->time_base, ost->time_base);

   /* mux encoded frame */
   return av_interleaved_write_frame(ofmt_ctx, &pkt);
 }

 /**
  * Push one decoded frame through the stream's filter graph, then encode
  * and mux every frame the graph hands back.
  *
  * @param frame        decoded frame to feed in (the caller passes NULL
  *                     when flushing)
  * @param stream_index index into filter_ctx and the stream arrays
  * @return 0 once the sink has no more frames to give, a negative AVERROR
  *         code on failure
  */
 static int filter_encode_write_frame(AVFrame *frame, unsigned int stream_index) {
   FilteringContext *fc = &filter_ctx[stream_index];
   int err;

   /* push the decoded frame into the filtergraph */
   err = av_buffersrc_add_frame_flags(fc->buffersrc_ctx, frame, 0);
   if (err < 0) {
     av_log(NULL, AV_LOG_ERROR, "Error while feeding the filtergraph\n");
     return err;
   }

   /* pull filtered frames from the filtergraph */
   do {
     AVFrame *out = av_frame_alloc();
     if (!out)
       return AVERROR(ENOMEM);

     err = av_buffersink_get_frame(fc->buffersink_ctx, out);
     if (err < 0) {
       av_frame_free(&out);
       /* EAGAIN (nothing ready yet) and EOF (fully drained) both mean
        * normal completion, so they are reported as 0. */
       if (err == AVERROR(EAGAIN) || err == AVERROR_EOF)
         return 0;
       return err;
     }

     out->pict_type = AV_PICTURE_TYPE_NONE;
     /* encode_write_frame() releases the frame. */
     err = encode_write_frame(out, stream_index, NULL);
   } while (err >= 0);

   return err;
 }

 static int flush_encoder(unsigned int stream_index) {
   int ret;
   int got_frame;

   if (!(ofmt_ctx->streams[stream_index]->codec->codec->capabilities &
     CODEC_CAP_DELAY))
 return 0;

   while (1) {
 av_log(NULL, AV_LOG_INFO, "Flushing stream #%u encoder\n", stream_index);
 ret = encode_write_frame(NULL, stream_index, &got_frame);
 if (ret < 0)
   break;
 if (!got_frame)
   return 0;
   }
   return ret;
 }

 /**
  * Run one complete pass: read every packet, decode/filter/encode the
  * audio and video streams, remux the rest, flush, write the trailer and
  * (on a first pass) save the encoder statistics to STATS_LOG.
  *
  * All global contexts (ifmt_ctx, ofmt_ctx, filter_ctx) are released
  * before returning, whatever the outcome.
  *
  * @return 0 on success, a negative AVERROR code on failure. A negative
  *         code (rather than 1) is returned so that callers testing
  *         "< 0" actually notice the failure.
  */
 static int transcode(void) {
   int ret;
   AVPacket packet = { .data = NULL, .size = 0 };
   AVFrame *frame = NULL;
   enum AVMediaType type;
   unsigned int stream_index;
   unsigned int i;
   int got_frame;
   int (*dec_func)(AVCodecContext *, AVFrame *, int *, const AVPacket *);

   /* read all packets */
   while (1) {
     if ((ret = av_read_frame(ifmt_ctx, &packet)) < 0)
       break;
     stream_index = packet.stream_index;
     type = ifmt_ctx->streams[packet.stream_index]->codec->codec_type;
     av_log(NULL, AV_LOG_DEBUG, "Demuxer gave frame of stream_index %u\n",
        stream_index);

     if (filter_ctx[stream_index].filter_graph) {
       av_log(NULL, AV_LOG_DEBUG, "Going to reencode&filter the frame\n");
       frame = av_frame_alloc();
       if (!frame) {
         ret = AVERROR(ENOMEM);
         break;
       }
       /* Decoder works in its codec time base, not the container's. */
       packet.dts = av_rescale_q_rnd(packet.dts,
                     ifmt_ctx->streams[stream_index]->time_base,
                     ifmt_ctx->streams[stream_index]->codec->time_base,
                     AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
       packet.pts = av_rescale_q_rnd(packet.pts,
                     ifmt_ctx->streams[stream_index]->time_base,
                     ifmt_ctx->streams[stream_index]->codec->time_base,
                     AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
       dec_func = (type == AVMEDIA_TYPE_VIDEO) ? avcodec_decode_video2 :
         avcodec_decode_audio4;
       ret = dec_func(ifmt_ctx->streams[stream_index]->codec, frame,
              &got_frame, &packet);
       if (ret < 0) {
         av_frame_free(&frame);
         av_log(NULL, AV_LOG_ERROR, "Decoding failed\n");
         break;
       }

       if (got_frame) {
         frame->pts = av_frame_get_best_effort_timestamp(frame);
         ret = filter_encode_write_frame(frame, stream_index);
         av_frame_free(&frame);
         if (ret < 0)
           goto end;
       } else {
         av_frame_free(&frame);
       }
     } else {
       /* remux this frame without reencoding */
       packet.dts = av_rescale_q_rnd(packet.dts,
                     ifmt_ctx->streams[stream_index]->time_base,
                     ofmt_ctx->streams[stream_index]->time_base,
                     AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);
       packet.pts = av_rescale_q_rnd(packet.pts,
                     ifmt_ctx->streams[stream_index]->time_base,
                     ofmt_ctx->streams[stream_index]->time_base,
                     AV_ROUND_NEAR_INF|AV_ROUND_PASS_MINMAX);

       ret = av_interleaved_write_frame(ofmt_ctx, &packet);
       if (ret < 0)
         goto end;
     }
     av_free_packet(&packet);
   }

   /* flush filters and encoders */
   for (i = 0; i < ifmt_ctx->nb_streams; i++) {
     /* flush filter */
     if (!filter_ctx[i].filter_graph)
       continue;
     ret = filter_encode_write_frame(NULL, i);
     if (ret < 0) {
       av_log(NULL, AV_LOG_ERROR, "Flushing filter failed\n");
       goto end;
     }

     /* flush encoder */
     ret = flush_encoder(i);
     if (ret < 0) {
       av_log(NULL, AV_LOG_ERROR, "Flushing encoder failed\n");
       goto end;
     }
   }

   av_write_trailer(ofmt_ctx);

   /* Retrieve and store the first instance of codec statistics.
    * TODO: less naive, deal with multiple instances of statistics */
   for (i = 0; i < ofmt_ctx->nb_streams; i++) {
     AVCodecContext* codec = ofmt_ctx->streams[i]->codec;
     if ((codec->flags & CODEC_FLAG_PASS1) && (codec->stats_out)) {
       FILE* logfile = fopen(STATS_LOG, "wb");
       /* Without the stats file a second pass cannot run: fail here
        * instead of handing a NULL stream to fprintf(). */
       if (!logfile) {
         av_log(NULL, AV_LOG_ERROR, "Cannot open %s for writing\n", STATS_LOG);
         ret = AVERROR(EIO);
         goto end;
       }
       fprintf(logfile, "%s", codec->stats_out);
       if (fclose(logfile) < 0) {
         av_log(NULL, AV_LOG_ERROR, "Error closing log file.\n");
       }
       break;
     }
   }

   av_log(NULL, AV_LOG_INFO, "output duration = %" PRId64 "\n", ofmt_ctx->duration);

  end:
   av_free_packet(&packet);
   av_frame_free(&frame);
   for (i = 0; i < ifmt_ctx->nb_streams; i++) {
     avcodec_close(ifmt_ctx->streams[i]->codec);
     if (ofmt_ctx && ofmt_ctx->nb_streams > i && ofmt_ctx->streams[i] && ofmt_ctx->streams[i]->codec)
       avcodec_close(ofmt_ctx->streams[i]->codec);
     if (filter_ctx && filter_ctx[i].filter_graph)
       avfilter_graph_free(&filter_ctx[i].filter_graph);
   }
   av_free(filter_ctx);
   avformat_close_input(&ifmt_ctx);
   if (ofmt_ctx && !(ofmt_ctx->oformat->flags & AVFMT_NOFILE))
     avio_close(ofmt_ctx->pb);
   avformat_free_context(ofmt_ctx);

   if (ret < 0)
     av_log(NULL, AV_LOG_ERROR, "Error occurred: %s\n", av_err2str(ret));

   return ret < 0 ? ret : 0;
 }

 /**
  * Transcode a file to WebM (VP8/Vorbis) using two encoding passes.
  *
  * Each pass reopens the input, recreates the output context and encoders
  * and runs transcode(). The second pass feeds the statistics that the
  * first pass stored in STATS_LOG to the video encoder; the file is
  * removed after both passes have completed.
  *
  * @param inputPath       path of the file to read
  * @param outputPath      path of the WebM file to write
  * @param audioBitRate    audio encoder bit rate
  * @param crf             video encoder "crf" option
  * @param videoMaxBitRate video encoder bit rate
  * @param threads         video encoder thread count
  * @param quality         video encoder "quality" option
  * @param speed           video encoder "speed" option
  * @return 0 on success, a negative AVERROR code on failure
  */
 int TranscodeToWebM(char* inputPath, char* outputPath, int audioBitRate, int crf, int videoMaxBitRate, int threads,
         char* quality, int speed) {
   int ret = 0;
   unsigned int pass;
   /* NOTE(review): this buffer is never released here; whether the codec
    * context takes ownership through stats_in should be confirmed before
    * adding a free. */
   char* stats = NULL;

   av_register_all();
   avfilter_register_all();

   for (pass = 1; pass <= 2; pass++) {
     if ((ret = open_input_file(inputPath)) < 0)
       goto end;

     if ((ret = init_output_context(outputPath)) < 0)
       goto end;

     if (pass == 2) {
       size_t stats_length;
       /* Keep the error code: a bare "break" here used to leave ret
        * non-negative and the failure was reported as success. */
       if ((ret = cmdutils_read_file(STATS_LOG, &stats, &stats_length)) < 0) {
         av_log(NULL, AV_LOG_ERROR, "Error reading stats file.\n");
         goto end;
       }
     }

     if ((ret = init_webm_encoders(audioBitRate, crf, videoMaxBitRate, threads, quality, speed, pass, stats)) < 0)
       goto end;

     if ((ret = open_output_file(outputPath)) < 0)
       goto end;

     if ((ret = init_filters(AV_CODEC_ID_VORBIS)) < 0)
       goto end;

     if ((ret = transcode()) < 0)
       goto end;
   }

   if (remove(STATS_LOG) != 0) {
     av_log(NULL, AV_LOG_ERROR, "Failed to remove %s\n", STATS_LOG);
   }

  end:
   /* NOTE(review): the contexts opened in a pass are only released inside
    * transcode(); a failure before it is reached appears to leave them
    * allocated -- confirm and add cleanup if so. */
   if (ret < 0) {
     av_log(NULL, AV_LOG_ERROR, "Error occurred: %s\n", av_err2str(ret));
     return ret;
   }

   return 0;
 }

Here is the output from the ffmpeg command I am trying to mimic.

ffmpeg version N-62301-g59a5384 Copyright (c) 2000-2014 the FFmpeg developers
  built on Apr  9 2014 09:58:44 with gcc 4.8.2 (GCC) 20140206 (prerelease)
  configuration: --prefix=/opt/ffmpeg --extra-cflags=-I/opt/x264/include --extra-ldflags=-L/opt/x264/lib --extra-libs=-ldl --enable-gpl --enable-nonfree --enable-libfdk-aac --enable-libopus --enable-libvorbis --enable-libvpx --enable-libx264
  libavutil      52. 75.100 / 52. 75.100
  libavcodec     55. 58.103 / 55. 58.103
  libavformat    55. 36.102 / 55. 36.102
  libavdevice    55. 11.100 / 55. 11.100
  libavfilter     4.  3.100 /  4.  3.100
  libswscale      2.  6.100 /  2.  6.100
  libswresample   0. 18.100 /  0. 18.100
  libpostproc    52.  3.100 / 52.  3.100
Input #0, matroska,webm, from '/mnt/scratch/test_source/Sintel.2010.720p.mkv':
  Metadata:
encoder         : libebml v1.0.0 + libmatroska v1.0.0
creation_time   : 2011-04-24 17:20:33
  Duration: 00:14:48.03, start: 0.000000, bitrate: 6071 kb/s
Chapter #0.0: start 0.000000, end 103.125000
Metadata:
  title           : Chapter 01
Chapter #0.1: start 103.125000, end 148.667000
Metadata:
  title           : Chapter 02
Chapter #0.2: start 148.667000, end 349.792000
Metadata:
  title           : Chapter 03
Chapter #0.3: start 349.792000, end 437.208000
Metadata:
  title           : Chapter 04
Chapter #0.4: start 437.208000, end 472.075000
Metadata:
  title           : Chapter 05
Chapter #0.5: start 472.075000, end 678.833000
Metadata:
  title           : Chapter 06
Chapter #0.6: start 678.833000, end 744.083000
Metadata:
  title           : Chapter 07
Chapter #0.7: start 744.083000, end 888.032000
Metadata:
  title           : Chapter 08
Stream #0:0(eng): Video: h264 (High), yuv420p(tv, bt709), 1280x544, SAR 1:1 DAR 40:17, 24 fps, 24 tbr, 1k tbn, 48 tbc
Stream #0:1(eng): Audio: ac3, 48000 Hz, 5.1(side), fltp, 640 kb/s
Metadata:
  title           : AC3 5.1 @ 640 Kbps
Stream #0:2(ger): Subtitle: subrip
Stream #0:3(eng): Subtitle: subrip
Stream #0:4(spa): Subtitle: subrip
Stream #0:5(fre): Subtitle: subrip
Stream #0:6(ita): Subtitle: subrip
Stream #0:7(dut): Subtitle: subrip
Stream #0:8(pol): Subtitle: subrip
Stream #0:9(por): Subtitle: subrip
Stream #0:10(rus): Subtitle: subrip
Stream #0:11(vie): Subtitle: subrip
[libvpx @ 0x24b74c0] v1.3.0
Output #0, webm, to '/mnt/scratch/test_out/Sintel.2010.720p.script.webm':
  Metadata:
encoder         : Lavf55.36.102
Chapter #0.0: start 0.000000, end 103.125000
Metadata:
  title           : Chapter 01
Chapter #0.1: start 103.125000, end 148.667000
Metadata:
  title           : Chapter 02
Chapter #0.2: start 148.667000, end 349.792000
Metadata:
  title           : Chapter 03
Chapter #0.3: start 349.792000, end 437.208000
Metadata:
  title           : Chapter 04
Chapter #0.4: start 437.208000, end 472.075000
Metadata:
  title           : Chapter 05
Chapter #0.5: start 472.075000, end 678.833000
Metadata:
  title           : Chapter 06
Chapter #0.6: start 678.833000, end 744.083000
Metadata:
  title           : Chapter 07
Chapter #0.7: start 744.083000, end 888.032000
Metadata:
  title           : Chapter 08
Stream #0:0(eng): Video: vp8 (libvpx), yuv420p, 1280x544 [SAR 1:1 DAR 40:17], q=-1--1, pass 2, 60000 kb/s, 1k tbn, 24 tbc
Stream #0:1(eng): Audio: vorbis (libvorbis), 48000 Hz, 5.1(side), fltp, 384 kb/s
Metadata:
  title           : AC3 5.1 @ 640 Kbps
Stream mapping:
  Stream #0:0 -> #0:0 (h264 -> libvpx)
  Stream #0:1 -> #0:1 (ac3 -> libvorbis)
Press [q] to stop, [?] for help
frame=21312 fps= 11 q=0.0 Lsize=  567191kB time=00:14:48.01 bitrate=5232.4kbits/s    
video:537377kB audio:29266kB subtitle:0kB other streams:0kB global headers:7kB muxing overhead: 0.096885%

Solution

It appears that your problem is in your transcoding function: you need to use av_rescale_q to convert between your input format's PTS/DTS and your output format's PTS/DTS.

Don't trust putting arbitrary numbers in either; continue doing what you're doing by getting the time bases from the context and codec.

I wrote a bit about PTS/DTS here and it may be useful for you. In your case though you have a "correct" PTS/DTS already that you need to transform into your output PTS/DTS.

Licensed under: CC-BY-SA with attribution

Not affiliated with StackOverflow