scuffle_transmuxer/
lib.rs

//! A crate for transmuxing video streams.
//!
//! ## Status
//!
//! This crate is currently under development and is not yet stable.
//!
//! Unit tests are not yet fully implemented. Use at your own risk.
//!
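//! ## Example
//!
//! A minimal sketch of the intended flow, assuming you already have raw FLV bytes from
//! somewhere (the empty `flv_bytes` placeholder below is illustrative, not a real source):
//!
//! ```rust,no_run
//! use bytes::Bytes;
//! use scuffle_transmuxer::{TransmuxResult, Transmuxer};
//!
//! // Placeholder: in practice this would be FLV data read from a file or socket.
//! let flv_bytes: Bytes = Bytes::new();
//!
//! let mut transmuxer = Transmuxer::new();
//! // Buffer the raw FLV tags.
//! transmuxer.demux(flv_bytes).expect("valid FLV data");
//!
//! // Drain fMP4 segments until the transmuxer needs more input.
//! while let Some(result) = transmuxer.mux().expect("transmux failed") {
//!     match result {
//!         TransmuxResult::InitSegment { .. } => { /* write the init segment */ }
//!         TransmuxResult::MediaSegment(_) => { /* write the media segment */ }
//!         // Any other results are ignored in this sketch.
//!         _ => {}
//!     }
//! }
//! ```
//!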
//! ## License
//!
//! This project is licensed under the [MIT](./LICENSE.MIT) or [Apache-2.0](./LICENSE.Apache-2.0) license.
//! You may choose either license when using this work.
//!
//! `SPDX-License-Identifier: MIT OR Apache-2.0`
#![allow(clippy::single_match)]
// #![deny(missing_docs)]
#![deny(unsafe_code)]
#![deny(unreachable_pub)]

use std::collections::VecDeque;
use std::fmt::Debug;
use std::io;

use byteorder::{BigEndian, ReadBytesExt};
use bytes::{Buf, Bytes};
use scuffle_flv::audio::AudioData;
use scuffle_flv::audio::body::AudioTagBody;
use scuffle_flv::audio::body::legacy::LegacyAudioTagBody;
use scuffle_flv::audio::body::legacy::aac::AacAudioData;
use scuffle_flv::audio::header::AudioTagHeader;
use scuffle_flv::audio::header::legacy::{LegacyAudioTagHeader, SoundType};
use scuffle_flv::script::{OnMetaData, ScriptData};
use scuffle_flv::tag::{FlvTag, FlvTagData};
use scuffle_flv::video::VideoData;
use scuffle_flv::video::body::VideoTagBody;
use scuffle_flv::video::body::enhanced::{ExVideoTagBody, VideoPacket, VideoPacketCodedFrames, VideoPacketSequenceStart};
use scuffle_flv::video::body::legacy::LegacyVideoTagBody;
use scuffle_flv::video::header::enhanced::VideoFourCc;
use scuffle_flv::video::header::legacy::{LegacyVideoTagHeader, LegacyVideoTagHeaderAvcPacket};
use scuffle_flv::video::header::{VideoFrameType, VideoTagHeader, VideoTagHeaderData};
use scuffle_h264::Sps;
use scuffle_mp4::BoxType;
use scuffle_mp4::codec::{AudioCodec, VideoCodec};
use scuffle_mp4::types::ftyp::{FourCC, Ftyp};
use scuffle_mp4::types::hdlr::{HandlerType, Hdlr};
use scuffle_mp4::types::mdat::Mdat;
use scuffle_mp4::types::mdhd::Mdhd;
use scuffle_mp4::types::mdia::Mdia;
use scuffle_mp4::types::mfhd::Mfhd;
use scuffle_mp4::types::minf::Minf;
use scuffle_mp4::types::moof::Moof;
use scuffle_mp4::types::moov::Moov;
use scuffle_mp4::types::mvex::Mvex;
use scuffle_mp4::types::mvhd::Mvhd;
use scuffle_mp4::types::smhd::Smhd;
use scuffle_mp4::types::stbl::Stbl;
use scuffle_mp4::types::stco::Stco;
use scuffle_mp4::types::stsc::Stsc;
use scuffle_mp4::types::stsd::Stsd;
use scuffle_mp4::types::stsz::Stsz;
use scuffle_mp4::types::stts::Stts;
use scuffle_mp4::types::tfdt::Tfdt;
use scuffle_mp4::types::tfhd::Tfhd;
use scuffle_mp4::types::tkhd::Tkhd;
use scuffle_mp4::types::traf::Traf;
use scuffle_mp4::types::trak::Trak;
use scuffle_mp4::types::trex::Trex;
use scuffle_mp4::types::trun::Trun;
use scuffle_mp4::types::vmhd::Vmhd;

mod codecs;
mod define;
mod errors;

pub use define::*;
pub use errors::TransmuxError;

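/// Sequence headers and `onMetaData` script data collected from the buffered FLV tags;
/// everything needed to build the fMP4 init segment.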
struct Tags<'a> {
    video_sequence_header: Option<VideoSequenceHeader>,
    audio_sequence_header: Option<AudioSequenceHeader>,
    scriptdata_tag: Option<OnMetaData<'a>>,
}

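/// Converts a buffered FLV stream into fragmented MP4 (fMP4) init and media segments.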
#[derive(Debug, Clone)]
pub struct Transmuxer<'a> {
    // These durations are measured in timescale units.
    /// sample_freq * 1000
    audio_duration: u64,
    /// fps * 1000
    video_duration: u64,
    sequence_number: u32,
    last_video_timestamp: u32,
    settings: Option<(VideoSettings, AudioSettings)>,
    tags: VecDeque<FlvTag<'a>>,
}

impl Default for Transmuxer<'_> {
    fn default() -> Self {
        Self::new()
    }
}

impl<'a> Transmuxer<'a> {
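    /// Creates an empty transmuxer with no buffered tags and no negotiated stream settings.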
    pub fn new() -> Self {
        Self {
            sequence_number: 1,
            tags: VecDeque::new(),
            audio_duration: 0,
            video_duration: 0,
            last_video_timestamp: 0,
            settings: None,
        }
    }

    /// Feed raw FLV data to the transmuxer.
    pub fn demux(&mut self, data: Bytes) -> Result<(), TransmuxError> {
        let mut cursor = io::Cursor::new(data);
        while cursor.has_remaining() {
            cursor.read_u32::<BigEndian>()?; // previous tag size
            if !cursor.has_remaining() {
                break;
            }

            let tag = FlvTag::demux(&mut cursor)?;
            self.tags.push_back(tag);
        }

        Ok(())
    }

    /// Feed a single FLV tag to the transmuxer.
    pub fn add_tag(&mut self, tag: FlvTag<'a>) {
        self.tags.push_back(tag);
    }

    /// Get the next transmuxed packet. This will return `None` if there is not
    /// enough data to create a packet.
    pub fn mux(&mut self) -> Result<Option<TransmuxResult>, TransmuxError> {
        let mut writer = Vec::new();

        let Some((video_settings, _)) = &self.settings else {
            let Some((video_settings, audio_settings)) = self.init_sequence(&mut writer)? else {
                if self.tags.len() > 30 {
                    // We are clearly not getting any sequence headers, so we should just give up
                    return Err(TransmuxError::NoSequenceHeaders);
                }

                // We don't have enough tags to create an init segment yet
                return Ok(None);
            };

            self.settings = Some((video_settings.clone(), audio_settings.clone()));

            return Ok(Some(TransmuxResult::InitSegment {
                data: Bytes::from(writer),
                audio_settings,
                video_settings,
            }));
        };

        loop {
            let Some(tag) = self.tags.pop_front() else {
                return Ok(None);
            };

            let mdat_data;
            let total_duration;
            let trun_sample;
            let mut is_audio = false;
            let mut is_keyframe = false;

            let duration =
                if self.last_video_timestamp == 0 || tag.timestamp_ms == 0 || tag.timestamp_ms < self.last_video_timestamp {
                    // The first frame is always 1000 ticks, where the timescale is 1000 * fps.
                    1000
                } else {
                    // The delta is in milliseconds (i.e. 1/1000 of a second), so rounding errors
                    // happen: our precision is only 1/1000 of a second. For a 30fps video the
                    // delta should be 33.33ms (1000/30), but we can only represent it as 33ms or
                    // 34ms. To compensate, if the delta is within 1ms of the expected delta we
                    // just use the expected delta, i.e. exactly one frame (1000 ticks).
                    // We use a timescale of 1000 * fps because then a one-frame delta is always
                    // an integer; with a timescale of 1000 we would hit the same rounding errors.
                    let delta = tag.timestamp_ms as f64 - self.last_video_timestamp as f64;
                    let expected_delta = 1000.0 / video_settings.framerate;
                    if (delta - expected_delta).abs() <= 1.0 {
                        1000
                    } else {
                        (delta * video_settings.framerate) as u32
                    }
                };

            match tag.data {
                FlvTagData::Audio(AudioData {
                    body: AudioTagBody::Legacy(LegacyAudioTagBody::Aac(AacAudioData::Raw(data))),
                    ..
                }) => {
                    let (sample, duration) = codecs::aac::trun_sample(&data)?;

                    trun_sample = sample;
                    mdat_data = data;
                    total_duration = duration;
                    is_audio = true;
                }
                FlvTagData::Video(VideoData {
                    header:
                        VideoTagHeader {
                            frame_type,
                            data:
                                VideoTagHeaderData::Legacy(LegacyVideoTagHeader::AvcPacket(
                                    LegacyVideoTagHeaderAvcPacket::Nalu { composition_time_offset },
                                )),
                        },
                    body: VideoTagBody::Legacy(LegacyVideoTagBody::Other { data }),
                    ..
                }) => {
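                    // Convert the composition time offset from milliseconds to whole frames,
                    // then to ticks (one frame = 1000 ticks at the 1000 * fps timescale).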
                    let composition_time =
                        ((composition_time_offset as f64 * video_settings.framerate) / 1000.0).floor() * 1000.0;

                    let sample = codecs::avc::trun_sample(frame_type, composition_time as u32, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == VideoFrameType::KeyFrame;
                }
                FlvTagData::Video(VideoData {
                    header: VideoTagHeader { frame_type, .. },
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Av1,
                            packet: VideoPacket::CodedFrames(VideoPacketCodedFrames::Other(data)),
                        }),
                    ..
                }) => {
                    let sample = codecs::av1::trun_sample(frame_type, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == VideoFrameType::KeyFrame;
                }
                FlvTagData::Video(VideoData {
                    header: VideoTagHeader { frame_type, .. },
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Hevc,
                            packet,
                        }),
                    ..
                }) => {
                    let (composition_time, data) = match packet {
                        VideoPacket::CodedFrames(VideoPacketCodedFrames::Hevc {
                            composition_time_offset,
                            data,
                        }) => (Some(composition_time_offset), data),
                        VideoPacket::CodedFramesX { data } => (None, data),
                        _ => continue,
                    };

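                    // Same millisecond-to-tick conversion as the AVC path above; CodedFramesX
                    // packets carry no offset, so the offset defaults to 0.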
                    let composition_time =
                        ((composition_time.unwrap_or_default() as f64 * video_settings.framerate) / 1000.0).floor() * 1000.0;

                    let sample = codecs::hevc::trun_sample(frame_type, composition_time as i32, duration, &data)?;

                    trun_sample = sample;
                    total_duration = duration;
                    mdat_data = data;

                    is_keyframe = frame_type == VideoFrameType::KeyFrame;
                }
                _ => {
                    // We don't support anything else
                    continue;
                }
            }

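            // Build the single track fragment for this sample. Track ID 1 is video and
            // track ID 2 is audio, matching the traks written in the init segment.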
            let trafs = {
                let (main_duration, main_id) = if is_audio {
                    (self.audio_duration, 2)
                } else {
                    (self.video_duration, 1)
                };

                let mut traf = Traf::new(
                    Tfhd::new(main_id, None, None, None, None, None),
                    Some(Trun::new(vec![trun_sample], None)),
                    Some(Tfdt::new(main_duration)),
                );
                traf.optimize();

                vec![traf]
            };

            let mut moof = Moof::new(Mfhd::new(self.sequence_number), trafs);

            // We need to know the moof size so that we can set the data offset.
            let moof_size = moof.size();

            // We just created the moof with exactly one traf (and that traf with a trun),
            // so it is safe to unwrap them and set the data offset.
            let traf = moof.traf.get_mut(0).expect("we just created the moof with a traf");
            let trun = traf.trun.as_mut().expect("we just created the traf with a trun");

            // The sample data starts right after the moof, so the data offset is the size
            // of the moof plus 8 bytes for the mdat header.
            trun.data_offset = Some(moof_size as i32 + 8);

            // Write the moof to the writer.
            moof.mux(&mut writer)?;

            // Create an mdat box containing the sample data and write it to the writer.
            Mdat::new(vec![mdat_data]).mux(&mut writer)?;

            // Increase our sequence number; the track duration is advanced below.
            self.sequence_number += 1;

            if is_audio {
                self.audio_duration += total_duration as u64;
                return Ok(Some(TransmuxResult::MediaSegment(MediaSegment {
                    data: Bytes::from(writer),
                    ty: MediaType::Audio,
                    keyframe: false,
                    timestamp: self.audio_duration - total_duration as u64,
                })));
            } else {
                self.video_duration += total_duration as u64;
                self.last_video_timestamp = tag.timestamp_ms;
                return Ok(Some(TransmuxResult::MediaSegment(MediaSegment {
                    data: Bytes::from(writer),
                    ty: MediaType::Video,
                    keyframe: is_keyframe,
                    timestamp: self.video_duration - total_duration as u64,
                })));
            }
        }
    }

    /// Internal function to find the tags we need to create the init segment.
    fn find_tags(&self) -> Tags<'a> {
        let tags = self.tags.iter();
        let mut video_sequence_header = None;
        let mut audio_sequence_header = None;
        let mut scriptdata_tag = None;

        for tag in tags {
            if video_sequence_header.is_some() && audio_sequence_header.is_some() && scriptdata_tag.is_some() {
                break;
            }

            match &tag.data {
                FlvTagData::Video(VideoData {
                    body: VideoTagBody::Legacy(LegacyVideoTagBody::AvcVideoPacketSeqHdr(data)),
                    ..
                }) => {
                    video_sequence_header = Some(VideoSequenceHeader::Avc(data.clone()));
                }
                FlvTagData::Video(VideoData {
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Av1,
                            packet: VideoPacket::SequenceStart(VideoPacketSequenceStart::Av1(config)),
                        }),
                    ..
                }) => {
                    video_sequence_header = Some(VideoSequenceHeader::Av1(config.clone()));
                }
                FlvTagData::Video(VideoData {
                    body:
                        VideoTagBody::Enhanced(ExVideoTagBody::NoMultitrack {
                            video_four_cc: VideoFourCc::Hevc,
                            packet: VideoPacket::SequenceStart(VideoPacketSequenceStart::Hevc(config)),
                        }),
                    ..
                }) => {
                    video_sequence_header = Some(VideoSequenceHeader::Hevc(config.clone()));
                }
                FlvTagData::Audio(AudioData {
                    body: AudioTagBody::Legacy(LegacyAudioTagBody::Aac(AacAudioData::SequenceHeader(data))),
                    header:
                        AudioTagHeader::Legacy(LegacyAudioTagHeader {
                            sound_size, sound_type, ..
                        }),
                    ..
                }) => {
                    audio_sequence_header = Some(AudioSequenceHeader {
                        data: AudioSequenceHeaderData::Aac(data.clone()),
                        sound_size: *sound_size,
                        sound_type: *sound_type,
                    });
                }
                FlvTagData::ScriptData(ScriptData::OnMetaData(metadata)) => {
                    scriptdata_tag = Some(*metadata.clone());
                }
                _ => {}
            }
        }

        Tags {
            video_sequence_header,
            audio_sequence_header,
            scriptdata_tag,
        }
    }

    /// Create the init segment.
    fn init_sequence(
        &mut self,
        writer: &mut impl io::Write,
    ) -> Result<Option<(VideoSettings, AudioSettings)>, TransmuxError> {
        // We need to find the tag that is the video sequence header
        // and the audio sequence header
        let Tags {
            video_sequence_header,
            audio_sequence_header,
            scriptdata_tag,
        } = self.find_tags();

        let Some(video_sequence_header) = video_sequence_header else {
            return Ok(None);
        };
        let Some(audio_sequence_header) = audio_sequence_header else {
            return Ok(None);
        };

        let video_codec;
        let audio_codec;
        let video_width;
        let video_height;
        let audio_channels;
        let audio_sample_rate;
        let mut video_fps = 0.0;

        let mut estimated_video_bitrate = 0;
        let mut estimated_audio_bitrate = 0;

        if let Some(scriptdata_tag) = scriptdata_tag {
            video_fps = scriptdata_tag.framerate.unwrap_or(0.0);
            estimated_video_bitrate = scriptdata_tag.videodatarate.map(|v| (v * 1024.0) as u32).unwrap_or(0);
            estimated_audio_bitrate = scriptdata_tag.audiodatarate.map(|v| (v * 1024.0) as u32).unwrap_or(0);
        }

        let mut compatable_brands = vec![FourCC::Iso5, FourCC::Iso6];

        let video_stsd_entry = match video_sequence_header {
            VideoSequenceHeader::Avc(config) => {
                compatable_brands.push(FourCC::Avc1);
                video_codec = VideoCodec::Avc {
                    constraint_set: config.profile_compatibility,
                    level: config.level_indication,
                    profile: config.profile_indication,
                };

                let sps = Sps::parse_with_emulation_prevention(io::Cursor::new(&config.sps[0]))
                    .map_err(|_| TransmuxError::InvalidAVCDecoderConfigurationRecord)?;
                video_width = sps.width() as u32;
                video_height = sps.height() as u32;

                let frame_rate = sps.frame_rate();
                if let Some(frame_rate) = frame_rate {
                    video_fps = frame_rate;
                }

                codecs::avc::stsd_entry(config, &sps)?
            }
            VideoSequenceHeader::Av1(config) => {
                compatable_brands.push(FourCC::Av01);
                let (entry, seq_obu) = codecs::av1::stsd_entry(config)?;

                video_height = seq_obu.max_frame_height as u32;
                video_width = seq_obu.max_frame_width as u32;

                let op_point = &seq_obu.operating_points[0];

                video_codec = VideoCodec::Av1 {
                    profile: seq_obu.seq_profile,
                    level: op_point.seq_level_idx,
                    tier: op_point.seq_tier,
                    depth: seq_obu.color_config.bit_depth as u8,
                    monochrome: seq_obu.color_config.mono_chrome,
                    sub_sampling_x: seq_obu.color_config.subsampling_x,
                    sub_sampling_y: seq_obu.color_config.subsampling_y,
                    color_primaries: seq_obu.color_config.color_primaries,
                    transfer_characteristics: seq_obu.color_config.transfer_characteristics,
                    matrix_coefficients: seq_obu.color_config.matrix_coefficients,
                    full_range_flag: seq_obu.color_config.full_color_range,
                };

                entry
            }
            VideoSequenceHeader::Hevc(config) => {
                compatable_brands.push(FourCC::Hev1);
                video_codec = VideoCodec::Hevc {
                    constraint_indicator: config.general_constraint_indicator_flags,
                    level: config.general_level_idc,
                    profile: config.general_profile_idc,
                    profile_compatibility: config.general_profile_compatibility_flags,
                    tier: config.general_tier_flag,
                    general_profile_space: config.general_profile_space,
                };

                let (entry, sps) = codecs::hevc::stsd_entry(config)?;
                if let Some(info) = sps.vui_parameters.as_ref().and_then(|p| p.vui_timing_info.as_ref()) {
                    video_fps = info.time_scale.get() as f64 / info.num_units_in_tick.get() as f64;
                }

                video_width = sps.cropped_width() as u32;
                video_height = sps.cropped_height() as u32;

                entry
            }
        };

        let audio_stsd_entry = match audio_sequence_header.data {
            AudioSequenceHeaderData::Aac(data) => {
                compatable_brands.push(FourCC::Mp41);
                let (entry, config) =
                    codecs::aac::stsd_entry(audio_sequence_header.sound_size, audio_sequence_header.sound_type, data)?;

                audio_sample_rate = config.sampling_frequency;

                audio_codec = AudioCodec::Aac {
                    object_type: config.audio_object_type,
                };
                audio_channels = match audio_sequence_header.sound_type {
                    SoundType::Mono => 1,
                    SoundType::Stereo => 2,
                    _ => return Err(TransmuxError::InvalidAudioChannels),
                };

                entry
            }
        };

        if video_fps == 0.0 {
            return Err(TransmuxError::InvalidVideoFrameRate);
        }

        if video_width == 0 || video_height == 0 {
            return Err(TransmuxError::InvalidVideoDimensions);
        }

        if audio_sample_rate == 0 {
            return Err(TransmuxError::InvalidAudioSampleRate);
        }

        // We multiply the FPS by 1000 to avoid rounding errors.
        // Consider a video with a framerate of 30fps: each frame is 33.333333ms, and
        // since durations are integers we could only represent that as 33ms. With a
        // timescale of 30 * 1000 = 30000 units per second, each frame is exactly 1000
        // units long instead of an inexact 33ms.
        let video_timescale = (1000.0 * video_fps) as u32;

        Ftyp::new(FourCC::Iso5, 512, compatable_brands).mux(writer)?;
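        // The moov declares two tracks: track 1 (video) and track 2 (audio). The mvex box
        // marks the file as fragmented, so the sample tables (stts/stsc/stco/stsz) stay empty.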
        Moov::new(
            Mvhd::new(0, 0, 1000, 0, 1),
            vec![
                Trak::new(
                    Tkhd::new(0, 0, 1, 0, Some((video_width, video_height))),
                    None,
                    Mdia::new(
                        Mdhd::new(0, 0, video_timescale, 0),
                        Hdlr::new(HandlerType::Vide, "VideoHandler".to_string()),
                        Minf::new(
                            Stbl::new(
                                Stsd::new(vec![video_stsd_entry]),
                                Stts::new(vec![]),
                                Stsc::new(vec![]),
                                Stco::new(vec![]),
                                Some(Stsz::new(0, vec![])),
                            ),
                            Some(Vmhd::new()),
                            None,
                        ),
                    ),
                ),
                Trak::new(
                    Tkhd::new(0, 0, 2, 0, None),
                    None,
                    Mdia::new(
                        Mdhd::new(0, 0, audio_sample_rate, 0),
                        Hdlr::new(HandlerType::Soun, "SoundHandler".to_string()),
                        Minf::new(
                            Stbl::new(
                                Stsd::new(vec![audio_stsd_entry]),
                                Stts::new(vec![]),
                                Stsc::new(vec![]),
                                Stco::new(vec![]),
                                Some(Stsz::new(0, vec![])),
                            ),
                            None,
                            Some(Smhd::new()),
                        ),
                    ),
                ),
            ],
            Some(Mvex::new(vec![Trex::new(1), Trex::new(2)], None)),
        )
        .mux(writer)?;

        Ok(Some((
            VideoSettings {
                width: video_width,
                height: video_height,
                framerate: video_fps,
                codec: video_codec,
                bitrate: estimated_video_bitrate,
                timescale: video_timescale,
            },
            AudioSettings {
                codec: audio_codec,
                sample_rate: audio_sample_rate,
                channels: audio_channels,
                bitrate: estimated_audio_bitrate,
                timescale: audio_sample_rate,
            },
        )))
    }
}

#[cfg(test)]
mod tests;