add support for video-to-animated-image conversion with frame sampling

skidder · skidder · commit 73bcff18bae7 · 2025-10-31T00:21:09.000Z
Enables extraction of multiple frames from video sources (MP4, MOV, WEBM) at configurable
sample intervals to produce animated image outputs (animated WebP, GIF). Introduces
VideoFrameSampleInterval option to control frame sampling rate and extends decoder with
VideoDecoder interface for multi-frame extraction. Also changes frame conversion to write
using the destination cv::Mat's own row stride instead of forcing a 32-pixel-aligned stride.
diff --git a/avcodec.cpp b/avcodec.cpp
@@ -49,6 +49,13 @@ struct avcodec_decoder_struct {
     AVCodecContext* codec;
     AVIOContext* avio;
     int video_stream_index;
+
+    // Multi-frame extraction state
+    float frame_sample_interval;  // Interval between extracted frames in seconds
+    double next_frame_time;        // PTS of next frame to extract
+    double last_extracted_pts;     // PTS of last extracted frame
+    int frame_delay_ms;            // Delay for current frame in milliseconds
+    bool multi_frame_mode;         // Whether we're extracting multiple frames
 };
 
 static int avcodec_decoder_read_callback(void* d_void, uint8_t* buf, int buf_size)
@@ -472,29 +479,22 @@ bool avcodec_decoder_has_subtitles(const avcodec_decoder d)
     return false;
 }
 
-static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame)
+static int avcodec_decoder_convert_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame)
 {
     if (!d || !d->codec || !d->codec->codec || !mat || !frame) {
         return -1;
     }
-    
+
     auto cvMat = static_cast<cv::Mat*>(mat);
     if (!cvMat) {
         return -1;
     }
 
-    int res = avcodec_receive_frame(d->codec, frame);
-    if (res >= 0) {
-        // Calculate the step size based on the cv::Mat's width
-        int stepSize =
-          4 * cvMat->cols; // Assuming the cv::Mat is in BGRA format, which has 4 channels
-        if (cvMat->cols % 32 != 0) {
-            int width = cvMat->cols + 32 - (cvMat->cols % 32);
-            stepSize = 4 * width;
-        }
-        if (!opencv_mat_set_row_stride(mat, stepSize)) {
-            return -1;
-        }
+    int res = 0;
+    {
+        // Use the cv::Mat's actual step (stride) instead of setting a custom one
+        // This ensures consistency with OpenCV operations and encoding
+        int stepSize = cvMat->step;
 
         // Create SwsContext for converting the frame format and scaling
         struct SwsContext* sws =
@@ -541,8 +541,8 @@ static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, A
         sws_setColorspaceDetails(sws, inv_table, srcRange, table, 1, 0, 1 << 16, 1 << 16);
 
         // The linesizes and data pointers for the destination
-        int dstLinesizes[4];
-        av_image_fill_linesizes(dstLinesizes, AV_PIX_FMT_BGRA, stepSize / 4);
+        // For BGRA, we only use the first plane, so use the Mat's actual stride
+        int dstLinesizes[4] = {stepSize, 0, 0, 0};
         uint8_t* dstData[4] = {cvMat->data, NULL, NULL, NULL};
 
         // Perform the scaling and format conversion
@@ -555,6 +555,15 @@ static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, A
     return res;
 }
 
+// Pulls the next decoded frame out of the codec and converts it into mat.
+// Returns -1 on invalid arguments, the negative libav error reported by
+// avcodec_receive_frame when no frame is available, or otherwise the result
+// of avcodec_decoder_convert_frame.
+static int avcodec_decoder_copy_frame(const avcodec_decoder d, opencv_mat mat, AVFrame* frame)
+{
+    // Validate before touching d->codec: convert_frame performs its own checks,
+    // but only after avcodec_receive_frame would already have dereferenced d.
+    if (!d || !d->codec || !frame) {
+        return -1;
+    }
+
+    int res = avcodec_receive_frame(d->codec, frame);
+    if (res < 0) {
+        return res;
+    }
+    return avcodec_decoder_convert_frame(d, mat, frame);
+}
+
 static int avcodec_decoder_decode_packet(const avcodec_decoder d, opencv_mat mat, AVPacket* packet)
 {
     int res = avcodec_send_packet(d->codec, packet);
@@ -578,7 +587,67 @@ bool avcodec_decoder_decode(const avcodec_decoder d, opencv_mat mat)
     if (!d || !d->container || !d->codec || !mat) {
         return false;
     }
+
     AVPacket packet;
+    AVStream* video_stream = d->container->streams[d->video_stream_index];
+
+    // If we're in multi-frame mode, we need to sample frames based on time
+    if (d->multi_frame_mode) {
+        AVFrame* frame = av_frame_alloc();
+        if (!frame) {
+            return false;
+        }
+
+        while (true) {
+            int res = av_read_frame(d->container, &packet);
+            if (res < 0) {
+                av_frame_free(&frame);
+                return false;
+            }
+
+            if (packet.stream_index != d->video_stream_index) {
+                av_packet_unref(&packet);
+                continue;
+            }
+
+            res = avcodec_send_packet(d->codec, &packet);
+            av_packet_unref(&packet);
+
+            if (res < 0) {
+                continue;
+            }
+
+            while (avcodec_receive_frame(d->codec, frame) == 0) {
+                double frame_time = -1.0;
+                if (frame->pts != AV_NOPTS_VALUE) {
+                    frame_time = frame->pts * av_q2d(video_stream->time_base);
+                }
+
+                // Check if this frame should be extracted based on sampling interval
+                if (frame_time >= 0 && frame_time >= d->next_frame_time) {
+                    // Calculate frame delay for animation
+                    if (d->last_extracted_pts >= 0) {
+                        d->frame_delay_ms = (int)((frame_time - d->last_extracted_pts) * 1000.0);
+                    } else {
+                        d->frame_delay_ms = (int)(d->frame_sample_interval * 1000.0);
+                    }
+
+                    d->last_extracted_pts = frame_time;
+                    d->next_frame_time = frame_time + d->frame_sample_interval;
+
+                    // Convert frame to output mat
+                    res = avcodec_decoder_convert_frame(d, mat, frame);
+                    av_frame_free(&frame);
+
+                    return (res >= 0);
+                }
+
+                av_frame_unref(frame);
+            }
+        }
+    }
+
+    // Single-frame mode: just decode the first video frame
     bool done = false;
     bool success = false;
     while (!done) {
@@ -601,6 +670,26 @@ bool avcodec_decoder_decode(const avcodec_decoder d, opencv_mat mat)
     return success;
 }
 
+// Configures time-based frame sampling and resets the sampling state.
+// A positive interval turns multi-frame mode on; any other value turns it off.
+// Does nothing when d is NULL.
+void avcodec_decoder_set_frame_sample_interval(avcodec_decoder d, float interval_seconds)
+{
+    if (d) {
+        d->multi_frame_mode = (interval_seconds > 0.0);
+        d->frame_sample_interval = interval_seconds;
+        // With the threshold at zero, the first frame carrying a non-negative
+        // timestamp qualifies for extraction.
+        d->next_frame_time = 0.0;
+        // A negative value marks "no frame extracted yet" for the delay calculation.
+        d->last_extracted_pts = -1.0;
+        d->frame_delay_ms = 0;
+    }
+}
+
+// Returns the delay, in milliseconds, stored for the most recently extracted
+// frame, or 0 when d is NULL.
+int avcodec_decoder_get_frame_delay_ms(const avcodec_decoder d)
+{
+    return d ? d->frame_delay_ms : 0;
+}
+
 void avcodec_decoder_release(avcodec_decoder d)
 {
     if (d->codec) {
diff --git a/avcodec.go b/avcodec.go
@@ -22,13 +22,15 @@ var av1Enabled string
 
 // avCodecDecoder handles decoding of various video/image formats using FFmpeg's avcodec.
 type avCodecDecoder struct {
-	decoder      C.avcodec_decoder
-	mat          C.opencv_mat
-	buf          []byte
-	hasDecoded   bool
-	maybeMP4     bool
-	isStreamable bool
-	hasSubtitles bool
+	decoder             C.avcodec_decoder
+	mat                 C.opencv_mat
+	buf                 []byte
+	hasDecoded          bool
+	maybeMP4            bool
+	isStreamable        bool
+	hasSubtitles        bool
+	multiFrameMode      bool
+	frameSampleInterval float64
 }
 
 // newAVCodecDecoder creates a new decoder instance from the provided buffer.
@@ -130,22 +132,32 @@ func (d *avCodecDecoder) Duration() time.Duration {
 }
 
 // Header returns the image metadata including dimensions, pixel format, and orientation.
-// Frame count is always 1 since it requires the entire buffer to be decoded.
+// Frame count is 1 for single-frame mode, or estimated from duration and sample interval in multi-frame mode.
+// The multi-frame count is computed arithmetically here, not by decoding, so the number of
+// frames DecodeTo actually yields may differ from it.
 func (d *avCodecDecoder) Header() (*ImageHeader, error) {
+	numFrames := 1
+	if d.multiFrameMode && d.frameSampleInterval > 0 {
+		// Estimate the number of frames based on duration and sample interval
+		// NOTE(review): the division assumes avcodec_decoder_get_duration reports seconds,
+		// the same unit as frameSampleInterval — confirm against the C implementation.
+		duration := float64(C.avcodec_decoder_get_duration(d.decoder))
+		if duration > 0 {
+			// +1 accounts for the frame sampled at the start of the timeline.
+			numFrames = int(duration/d.frameSampleInterval) + 1
+		}
+	}
+
 	return &ImageHeader{
 		width:         int(C.avcodec_decoder_get_width(d.decoder)),
 		height:        int(C.avcodec_decoder_get_height(d.decoder)),
 		pixelType:     PixelType(C.CV_8UC4),
 		orientation:   ImageOrientation(C.avcodec_decoder_get_orientation(d.decoder)),
-		numFrames:     1,
+		numFrames:     numFrames,
 		contentLength: len(d.buf),
 	}, nil
 }
 
 // DecodeTo decodes the next frame into the provided Framebuffer.
 // Returns io.EOF when no more frames are available.
 func (d *avCodecDecoder) DecodeTo(f *Framebuffer) error {
-	if d.hasDecoded {
+	// In single-frame mode, only decode once
+	if !d.multiFrameMode && d.hasDecoded {
 		return io.EOF
 	}
 	h, err := d.Header()
@@ -156,16 +168,27 @@ func (d *avCodecDecoder) DecodeTo(f *Framebuffer) error {
 	if err != nil {
 		return err
 	}
+
+	// Call decode - it handles both single-frame and multi-frame modes internally
 	ret := C.avcodec_decoder_decode(d.decoder, f.mat)
 	if !ret {
-		return ErrDecodingFailed
+		return io.EOF
 	}
+
+	// Set frame properties
+	if d.multiFrameMode {
+		// Get the frame delay from the decoder
+		frameDelayMs := int(C.avcodec_decoder_get_frame_delay_ms(d.decoder))
+		f.duration = time.Duration(frameDelayMs) * time.Millisecond
+	} else {
+		f.duration = time.Duration(0)
+		d.hasDecoded = true
+	}
+
 	f.blend = NoBlend
 	f.dispose = DisposeToBackgroundColor
-	f.duration = time.Duration(0)
 	f.xOffset = 0
 	f.yOffset = 0
-	d.hasDecoded = true
 	return nil
 }
 
@@ -174,6 +197,14 @@ func (d *avCodecDecoder) SkipFrame() error {
 	return ErrSkipNotSupported
 }
 
+// SetFrameSampleInterval configures the decoder to extract frames at the specified
+// interval in seconds. A positive interval enables multi-frame extraction mode for
+// videos; a zero or negative interval leaves the decoder in single-frame mode.
+func (d *avCodecDecoder) SetFrameSampleInterval(intervalSeconds float64) {
+	// Decide the mode from the exact value handed to the native decoder, which only
+	// enters multi-frame mode for an interval > 0. Setting the flag unconditionally
+	// would bypass the hasDecoded EOF guard in DecodeTo while the native side is
+	// still decoding single frames.
+	interval := C.float(intervalSeconds)
+	d.multiFrameMode = interval > 0
+	d.frameSampleInterval = intervalSeconds
+	C.avcodec_decoder_set_frame_sample_interval(d.decoder, interval)
+}
+
 // Close releases all resources associated with the decoder.
 func (d *avCodecDecoder) Close() {
 	C.avcodec_decoder_release(d.decoder)
diff --git a/avcodec.hpp b/avcodec.hpp
@@ -24,6 +24,8 @@ const char* avcodec_decoder_get_description(const avcodec_decoder d);
 const char* avcodec_decoder_get_video_codec(const avcodec_decoder d);
 const char* avcodec_decoder_get_audio_codec(const avcodec_decoder d);
 int avcodec_decoder_get_icc(const avcodec_decoder d, void* dest, size_t dest_len);
+void avcodec_decoder_set_frame_sample_interval(avcodec_decoder d, float interval_seconds);
+int avcodec_decoder_get_frame_delay_ms(const avcodec_decoder d);
 
 #ifdef __cplusplus
 }
diff --git a/lilliput.go b/lilliput.go
@@ -80,6 +80,16 @@ type Decoder interface {
 	AudioCodec() string
 }
 
+// VideoDecoder extends Decoder with video-specific functionality.
+// Code holding a plain Decoder can detect support with a type assertion
+// and fall back to ordinary decoding when the assertion fails.
+type VideoDecoder interface {
+	Decoder
+
+	// SetFrameSampleInterval configures the decoder to extract frames at the specified
+	// interval in seconds. For example, 0.1 means extract a frame every 100ms (10 FPS).
+	// This enables multi-frame extraction mode.
+	SetFrameSampleInterval(intervalSeconds float64)
+}
+
 // An Encoder compresses raw pixel data into a well-known image type.
 type Encoder interface {
 	// Encode encodes the pixel data in f into the dst provided to NewEncoder. Encode quality
diff --git a/opencv.go b/opencv.go
@@ -178,6 +178,12 @@ func (h *ImageHeader) IsAnimated() bool {
 	return h.numFrames > 1
 }
 
+// NumFrames returns the number of frames in the image.
+// Returns 1 for static images, >1 for animations.
+// The value is whatever the decoder stored in the header; a decoder that samples
+// frames from video may report an estimate rather than an exact count.
+func (h *ImageHeader) NumFrames() int {
+	return h.numFrames
+}
+
 // HasAlpha returns true if the image has an alpha channel.
 func (h *ImageHeader) HasAlpha() bool {
 	return h.pixelType.Channels() == 4
diff --git a/ops.go b/ops.go
@@ -51,6 +51,12 @@ type ImageOptions struct {
 
 	// DisableAnimatedOutput controls the encoder behavior when given a multi-frame input
 	DisableAnimatedOutput bool
+
+	// VideoFrameSampleInterval controls the frame sampling rate for video inputs.
+	// For example, 0.1 means extract a frame every 100ms (10 FPS).
+	// If set to 0, only the first frame will be extracted (default behavior).
+	// This option only applies to video formats (MP4, MOV, WEBM).
+	VideoFrameSampleInterval float64
 }
 
 // ImageOps is a reusable object that can resize and encode images.
@@ -259,11 +265,16 @@ func (o *ImageOps) encodeEmpty(e Encoder, opt map[int]int) ([]byte, error) {
 
 // skipToEnd advances the decoder to the final frame of an animation.
 // Returns io.EOF when the end is reached or an error if seeking fails.
+// If the decoder doesn't support skipping, this is a no-op and returns io.EOF.
 func (o *ImageOps) skipToEnd(d Decoder) error {
 	var err error
 	for {
 		err = d.SkipFrame()
 		if err != nil {
+			// If skip is not supported, treat it as end-of-stream
+			if err == ErrSkipNotSupported {
+				return io.EOF
+			}
 			return err
 		}
 	}
@@ -395,6 +406,13 @@ func (o *ImageOps) transformCurrentFrame(d Decoder, opt *ImageOptions, inputHead
 // initializeTransform prepares for image transformation by reading the input header
 // and creating an appropriate encoder. Returns the header, encoder, and any error.
 func (o *ImageOps) initializeTransform(d Decoder, opt *ImageOptions, dst []byte) (*ImageHeader, Encoder, error) {
+	// Enable multi-frame video extraction if requested
+	if opt.VideoFrameSampleInterval > 0 {
+		if vd, ok := d.(VideoDecoder); ok {
+			vd.SetFrameSampleInterval(opt.VideoFrameSampleInterval)
+		}
+	}
+
 	inputHeader, err := d.Header()
 	if err != nil {
 		return nil, nil, err
diff --git a/video_to_animated_test.go b/video_to_animated_test.go

Original file line number	Diff line number	Diff line change
`@@ -24,6 +24,8 @@ const char* avcodec_decoder_get_description(const avcodec_decoder d);`
`24`	`24`	`const char* avcodec_decoder_get_video_codec(const avcodec_decoder d);`
`25`	`25`	`const char* avcodec_decoder_get_audio_codec(const avcodec_decoder d);`
`26`	`26`	`int avcodec_decoder_get_icc(const avcodec_decoder d, void* dest, size_t dest_len);`
	`27`	`+void avcodec_decoder_set_frame_sample_interval(avcodec_decoder d, float interval_seconds);`
	`28`	`+int avcodec_decoder_get_frame_delay_ms(const avcodec_decoder d);`
`27`	`29`
`28`	`30`	`#ifdef __cplusplus`
`29`	`31`	`}`