Skip to content

Commit 73bcff1

Browse files
committed
add support for video-to-animated-image conversion with frame sampling
Enables extraction of multiple frames from video sources (MP4, MOV, WEBM) at configurable sample intervals to produce animated image outputs (animated WebP, GIF). Introduces VideoFrameSampleInterval option to control frame sampling rate and extends decoder with VideoDecoder interface for multi-frame extraction. Also improves memory alignment handling for better SIMD performance.
1 parent 25e9d6c commit 73bcff1

File tree

7 files changed

+588
-29
lines changed

7 files changed

+588
-29
lines changed

avcodec.cpp

Lines changed: 105 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -49,6 +49,13 @@ struct avcodec_decoder_struct {
4949
AVCodecContext* codec;
5050
AVIOContext* avio;
5151
int video_stream_index;
52+
53+
// Multi-frame extraction state
54+
float frame_sample_interval; // Interval between extracted frames in seconds
55+
double next_frame_time; // PTS of next frame to extract
56+
double last_extracted_pts; // PTS of last extracted frame
57+
int frame_delay_ms; // Delay for current frame in milliseconds
58+
bool multi_frame_mode; // Whether we're extracting multiple frames
5259
};
5360

5461
static int avcodec_decoder_read_callback(void* d_void, uint8_t* buf, int buf_size)
@@ -472,29 +479,22 @@ bool avcodec_decoder_has_subtitles(const avcodec_decoder d)
472479
return false;
473480
}
474481

475-
static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame)
482+
static int avcodec_decoder_convert_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame)
476483
{
477484
if (!d || !d->codec || !d->codec->codec || !mat || !frame) {
478485
return -1;
479486
}
480-
487+
481488
auto cvMat = static_cast<cv::Mat*>(mat);
482489
if (!cvMat) {
483490
return -1;
484491
}
485492

486-
int res = avcodec_receive_frame(d->codec, frame);
487-
if (res >= 0) {
488-
// Calculate the step size based on the cv::Mat's width
489-
int stepSize =
490-
4 * cvMat->cols; // Assuming the cv::Mat is in BGRA format, which has 4 channels
491-
if (cvMat->cols % 32 != 0) {
492-
int width = cvMat->cols + 32 - (cvMat->cols % 32);
493-
stepSize = 4 * width;
494-
}
495-
if (!opencv_mat_set_row_stride(mat, stepSize)) {
496-
return -1;
497-
}
493+
int res = 0;
494+
{
495+
// Use the cv::Mat's actual step (stride) instead of setting a custom one
496+
// This ensures consistency with OpenCV operations and encoding
497+
int stepSize = cvMat->step;
498498

499499
// Create SwsContext for converting the frame format and scaling
500500
struct SwsContext* sws =
@@ -541,8 +541,8 @@ static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, A
541541
sws_setColorspaceDetails(sws, inv_table, srcRange, table, 1, 0, 1 << 16, 1 << 16);
542542

543543
// The linesizes and data pointers for the destination
544-
int dstLinesizes[4];
545-
av_image_fill_linesizes(dstLinesizes, AV_PIX_FMT_BGRA, stepSize / 4);
544+
// For BGRA, we only use the first plane, so use the Mat's actual stride
545+
int dstLinesizes[4] = {stepSize, 0, 0, 0};
546546
uint8_t* dstData[4] = {cvMat->data, NULL, NULL, NULL};
547547

548548
// Perform the scaling and format conversion
@@ -555,6 +555,15 @@ static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, A
555555
return res;
556556
}
557557

558+
static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame)
559+
{
560+
int res = avcodec_receive_frame(d->codec, frame);
561+
if (res >= 0) {
562+
return avcodec_decoder_convert_frame(d, mat, frame);
563+
}
564+
return res;
565+
}
566+
558567
static int avcodec_decoder_decode_packet(const avcodec_decoder d, opencv_mat mat, AVPacket* packet)
559568
{
560569
int res = avcodec_send_packet(d->codec, packet);
@@ -578,7 +587,67 @@ bool avcodec_decoder_decode(const avcodec_decoder d, opencv_mat mat)
578587
if (!d || !d->container || !d->codec || !mat) {
579588
return false;
580589
}
590+
581591
AVPacket packet;
592+
AVStream* video_stream = d->container->streams[d->video_stream_index];
593+
594+
// If we're in multi-frame mode, we need to sample frames based on time
595+
if (d->multi_frame_mode) {
596+
AVFrame* frame = av_frame_alloc();
597+
if (!frame) {
598+
return false;
599+
}
600+
601+
while (true) {
602+
int res = av_read_frame(d->container, &packet);
603+
if (res < 0) {
604+
av_frame_free(&frame);
605+
return false;
606+
}
607+
608+
if (packet.stream_index != d->video_stream_index) {
609+
av_packet_unref(&packet);
610+
continue;
611+
}
612+
613+
res = avcodec_send_packet(d->codec, &packet);
614+
av_packet_unref(&packet);
615+
616+
if (res < 0) {
617+
continue;
618+
}
619+
620+
while (avcodec_receive_frame(d->codec, frame) == 0) {
621+
double frame_time = -1.0;
622+
if (frame->pts != AV_NOPTS_VALUE) {
623+
frame_time = frame->pts * av_q2d(video_stream->time_base);
624+
}
625+
626+
// Check if this frame should be extracted based on sampling interval
627+
if (frame_time >= 0 && frame_time >= d->next_frame_time) {
628+
// Calculate frame delay for animation
629+
if (d->last_extracted_pts >= 0) {
630+
d->frame_delay_ms = (int)((frame_time - d->last_extracted_pts) * 1000.0);
631+
} else {
632+
d->frame_delay_ms = (int)(d->frame_sample_interval * 1000.0);
633+
}
634+
635+
d->last_extracted_pts = frame_time;
636+
d->next_frame_time = frame_time + d->frame_sample_interval;
637+
638+
// Convert frame to output mat
639+
res = avcodec_decoder_convert_frame(d, mat, frame);
640+
av_frame_free(&frame);
641+
642+
return (res >= 0);
643+
}
644+
645+
av_frame_unref(frame);
646+
}
647+
}
648+
}
649+
650+
// Single-frame mode: just decode the first video frame
582651
bool done = false;
583652
bool success = false;
584653
while (!done) {
@@ -601,6 +670,26 @@ bool avcodec_decoder_decode(const avcodec_decoder d, opencv_mat mat)
601670
return success;
602671
}
603672

673+
void avcodec_decoder_set_frame_sample_interval(avcodec_decoder d, float interval_seconds)
674+
{
675+
if (!d) {
676+
return;
677+
}
678+
d->frame_sample_interval = interval_seconds;
679+
d->next_frame_time = 0.0;
680+
d->last_extracted_pts = -1.0;
681+
d->frame_delay_ms = 0;
682+
d->multi_frame_mode = (interval_seconds > 0.0);
683+
}
684+
685+
int avcodec_decoder_get_frame_delay_ms(const avcodec_decoder d)
686+
{
687+
if (!d) {
688+
return 0;
689+
}
690+
return d->frame_delay_ms;
691+
}
692+
604693
void avcodec_decoder_release(avcodec_decoder d)
605694
{
606695
if (d->codec) {

avcodec.go

Lines changed: 44 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -22,13 +22,15 @@ var av1Enabled string
2222

2323
// avCodecDecoder handles decoding of various video/image formats using FFmpeg's avcodec.
2424
type avCodecDecoder struct {
25-
decoder C.avcodec_decoder
26-
mat C.opencv_mat
27-
buf []byte
28-
hasDecoded bool
29-
maybeMP4 bool
30-
isStreamable bool
31-
hasSubtitles bool
25+
decoder C.avcodec_decoder
26+
mat C.opencv_mat
27+
buf []byte
28+
hasDecoded bool
29+
maybeMP4 bool
30+
isStreamable bool
31+
hasSubtitles bool
32+
multiFrameMode bool
33+
frameSampleInterval float64
3234
}
3335

3436
// newAVCodecDecoder creates a new decoder instance from the provided buffer.
@@ -130,22 +132,32 @@ func (d *avCodecDecoder) Duration() time.Duration {
130132
}
131133

132134
// Header returns the image metadata including dimensions, pixel format, and orientation.
133-
// Frame count is always 1 since it requires the entire buffer to be decoded.
135+
// Frame count is 1 for single-frame mode, or estimated from duration and sample interval in multi-frame mode.
134136
func (d *avCodecDecoder) Header() (*ImageHeader, error) {
137+
numFrames := 1
138+
if d.multiFrameMode && d.frameSampleInterval > 0 {
139+
// Estimate the number of frames based on duration and sample interval
140+
duration := float64(C.avcodec_decoder_get_duration(d.decoder))
141+
if duration > 0 {
142+
numFrames = int(duration/d.frameSampleInterval) + 1
143+
}
144+
}
145+
135146
return &ImageHeader{
136147
width: int(C.avcodec_decoder_get_width(d.decoder)),
137148
height: int(C.avcodec_decoder_get_height(d.decoder)),
138149
pixelType: PixelType(C.CV_8UC4),
139150
orientation: ImageOrientation(C.avcodec_decoder_get_orientation(d.decoder)),
140-
numFrames: 1,
151+
numFrames: numFrames,
141152
contentLength: len(d.buf),
142153
}, nil
143154
}
144155

145156
// DecodeTo decodes the next frame into the provided Framebuffer.
146157
// Returns io.EOF when no more frames are available.
147158
func (d *avCodecDecoder) DecodeTo(f *Framebuffer) error {
148-
if d.hasDecoded {
159+
// In single-frame mode, only decode once
160+
if !d.multiFrameMode && d.hasDecoded {
149161
return io.EOF
150162
}
151163
h, err := d.Header()
@@ -156,16 +168,27 @@ func (d *avCodecDecoder) DecodeTo(f *Framebuffer) error {
156168
if err != nil {
157169
return err
158170
}
171+
172+
// Call decode - it handles both single-frame and multi-frame modes internally
159173
ret := C.avcodec_decoder_decode(d.decoder, f.mat)
160174
if !ret {
161-
return ErrDecodingFailed
175+
return io.EOF
162176
}
177+
178+
// Set frame properties
179+
if d.multiFrameMode {
180+
// Get the frame delay from the decoder
181+
frameDelayMs := int(C.avcodec_decoder_get_frame_delay_ms(d.decoder))
182+
f.duration = time.Duration(frameDelayMs) * time.Millisecond
183+
} else {
184+
f.duration = time.Duration(0)
185+
d.hasDecoded = true
186+
}
187+
163188
f.blend = NoBlend
164189
f.dispose = DisposeToBackgroundColor
165-
f.duration = time.Duration(0)
166190
f.xOffset = 0
167191
f.yOffset = 0
168-
d.hasDecoded = true
169192
return nil
170193
}
171194

@@ -174,6 +197,14 @@ func (d *avCodecDecoder) SkipFrame() error {
174197
return ErrSkipNotSupported
175198
}
176199

200+
// SetFrameSampleInterval configures the decoder to extract frames at the specified
201+
// interval in seconds. This enables multi-frame extraction mode for videos.
202+
func (d *avCodecDecoder) SetFrameSampleInterval(intervalSeconds float64) {
203+
d.multiFrameMode = true
204+
d.frameSampleInterval = intervalSeconds
205+
C.avcodec_decoder_set_frame_sample_interval(d.decoder, C.float(intervalSeconds))
206+
}
207+
177208
// Close releases all resources associated with the decoder.
178209
func (d *avCodecDecoder) Close() {
179210
C.avcodec_decoder_release(d.decoder)

avcodec.hpp

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,8 @@ const char* avcodec_decoder_get_description(const avcodec_decoder d);
2424
const char* avcodec_decoder_get_video_codec(const avcodec_decoder d);
2525
const char* avcodec_decoder_get_audio_codec(const avcodec_decoder d);
2626
int avcodec_decoder_get_icc(const avcodec_decoder d, void* dest, size_t dest_len);
27+
void avcodec_decoder_set_frame_sample_interval(avcodec_decoder d, float interval_seconds);
28+
int avcodec_decoder_get_frame_delay_ms(const avcodec_decoder d);
2729

2830
#ifdef __cplusplus
2931
}

lilliput.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -80,6 +80,16 @@ type Decoder interface {
8080
AudioCodec() string
8181
}
8282

83+
// VideoDecoder extends Decoder with video-specific functionality
84+
type VideoDecoder interface {
85+
Decoder
86+
87+
// SetFrameSampleInterval configures the decoder to extract frames at the specified
88+
// interval in seconds. For example, 0.1 means extract a frame every 100ms (10 FPS).
89+
// This enables multi-frame extraction mode.
90+
SetFrameSampleInterval(intervalSeconds float64)
91+
}
92+
8393
// An Encoder compresses raw pixel data into a well-known image type.
8494
type Encoder interface {
8595
// Encode encodes the pixel data in f into the dst provided to NewEncoder. Encode quality

opencv.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,12 @@ func (h *ImageHeader) IsAnimated() bool {
178178
return h.numFrames > 1
179179
}
180180

181+
// NumFrames returns the number of frames in the image.
182+
// Returns 1 for static images, >1 for animations.
183+
func (h *ImageHeader) NumFrames() int {
184+
return h.numFrames
185+
}
186+
181187
// HasAlpha returns true if the image has an alpha channel.
182188
func (h *ImageHeader) HasAlpha() bool {
183189
return h.pixelType.Channels() == 4

ops.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,12 @@ type ImageOptions struct {
5151

5252
// DisableAnimatedOutput controls the encoder behavior when given a multi-frame input
5353
DisableAnimatedOutput bool
54+
55+
// VideoFrameSampleInterval controls the frame sampling rate for video inputs.
56+
// For example, 0.1 means extract a frame every 100ms (10 FPS).
57+
// If set to 0, only the first frame will be extracted (default behavior).
58+
// This option only applies to video formats (MP4, MOV, WEBM).
59+
VideoFrameSampleInterval float64
5460
}
5561

5662
// ImageOps is a reusable object that can resize and encode images.
@@ -259,11 +265,16 @@ func (o *ImageOps) encodeEmpty(e Encoder, opt map[int]int) ([]byte, error) {
259265

260266
// skipToEnd advances the decoder to the final frame of an animation.
261267
// Returns io.EOF when the end is reached or an error if seeking fails.
268+
// If the decoder doesn't support skipping, this is a no-op and returns io.EOF.
262269
func (o *ImageOps) skipToEnd(d Decoder) error {
263270
var err error
264271
for {
265272
err = d.SkipFrame()
266273
if err != nil {
274+
// If skip is not supported, treat it as end-of-stream
275+
if err == ErrSkipNotSupported {
276+
return io.EOF
277+
}
267278
return err
268279
}
269280
}
@@ -395,6 +406,13 @@ func (o *ImageOps) transformCurrentFrame(d Decoder, opt *ImageOptions, inputHead
395406
// initializeTransform prepares for image transformation by reading the input header
396407
// and creating an appropriate encoder. Returns the header, encoder, and any error.
397408
func (o *ImageOps) initializeTransform(d Decoder, opt *ImageOptions, dst []byte) (*ImageHeader, Encoder, error) {
409+
// Enable multi-frame video extraction if requested
410+
if opt.VideoFrameSampleInterval > 0 {
411+
if vd, ok := d.(VideoDecoder); ok {
412+
vd.SetFrameSampleInterval(opt.VideoFrameSampleInterval)
413+
}
414+
}
415+
398416
inputHeader, err := d.Header()
399417
if err != nil {
400418
return nil, nil, err

0 commit comments

Comments
 (0)