diff --git a/src/invidious/routes/api/v1/videos.cr b/src/invidious/routes/api/v1/videos.cr index 9281f4dd..faff2f59 100644 --- a/src/invidious/routes/api/v1/videos.cr +++ b/src/invidious/routes/api/v1/videos.cr @@ -89,9 +89,14 @@ module Invidious::Routes::API::V1::Videos if CONFIG.use_innertube_for_captions params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated) - initial_data = YoutubeAPI.get_transcript(params) - webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code) + transcript = Invidious::Videos::Transcript.from_raw( + YoutubeAPI.get_transcript(params), + caption.language_code, + caption.auto_generated + ) + + webvtt = transcript.to_vtt else # Timedtext API handling url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target diff --git a/src/invidious/videos/transcript.cr b/src/invidious/videos/transcript.cr index dac00eea..9cd064c5 100644 --- a/src/invidious/videos/transcript.cr +++ b/src/invidious/videos/transcript.cr @@ -1,8 +1,26 @@ module Invidious::Videos - # Namespace for methods primarily relating to Transcripts - module Transcript - record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String + # A `Transcripts` struct encapsulates a sequence of lines that together forms the whole transcript for a given YouTube video. + # These lines can be categorized into two types: section headings and regular lines representing content from the video. + struct Transcript + # Types + record HeadingLine, start_ms : Time::Span, end_ms : Time::Span, line : String + record RegularLine, start_ms : Time::Span, end_ms : Time::Span, line : String + alias TranscriptLine = HeadingLine | RegularLine + property lines : Array(TranscriptLine) + + property language_code : String + property auto_generated : Bool + + # User friendly label for the current transcript. + # Example: "English (auto-generated)" + property label : String + + # Initializes a new Transcript struct with the contents and associated metadata describing it + def initialize(@lines : Array(TranscriptLine), @language_code : String, @auto_generated : Bool, @label : String) + end + + # Generates a protobuf string to fetch the requested transcript from YouTube def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String kind = auto_generated ? "asr" : "" @@ -30,48 +48,79 @@ module Invidious::Videos return params end - def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String - # Convert into array of TranscriptLine - lines = self.parse(initial_data) + # Constructs a Transcripts struct from the initial YouTube response + def self.from_raw(initial_data : Hash(String, JSON::Any), language_code : String, auto_generated : Bool) + transcript_panel = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer", + "content", "transcriptSearchPanelRenderer") + segment_list = transcript_panel.dig("body", "transcriptSegmentListRenderer") + + if !segment_list["initialSegments"]? + raise NotFoundException.new("Requested transcript does not exist") + end + + # Extract user-friendly label for the current transcript + + footer_language_menu = transcript_panel.dig?( + "footer", "transcriptFooterRenderer", "languageMenu", "sortFilterSubMenuRenderer", "subMenuItems" + ) + + if footer_language_menu + label = footer_language_menu.as_a.select(&.["selected"].as_bool)[0]["title"].as_s + else + label = language_code + end + + # Extract transcript lines + + initial_segments = segment_list["initialSegments"].as_a + + lines = [] of TranscriptLine + + initial_segments.each do |line| + if unpacked_line = line["transcriptSectionHeaderRenderer"]? + line_type = HeadingLine + else + unpacked_line = line["transcriptSegmentRenderer"] + line_type = RegularLine + end + + start_ms = unpacked_line["startMs"].as_s.to_i.millisecond + end_ms = unpacked_line["endMs"].as_s.to_i.millisecond + text = extract_text(unpacked_line["snippet"]) || "" + + lines << line_type.new(start_ms, end_ms, text) + end + + return Transcript.new( + lines: lines, + language_code: language_code, + auto_generated: auto_generated, + label: label + ) + end + + # Converts transcript lines to a WebVTT file + # + # This is used within Invidious to replace subtitles + # as to workaround YouTube's rate-limited timedtext endpoint. + def to_vtt settings_field = { "Kind" => "captions", - "Language" => target_language, + "Language" => @language_code, } - # Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt() vtt = WebVTT.build(settings_field) do |vtt| - lines.each do |line| + @lines.each do |line| + # Section headers are excluded from the VTT conversion as to + # match the regular captions returned from YouTube as much as possible + next if line.is_a? HeadingLine + vtt.cue(line.start_ms, line.end_ms, line.line) end end return vtt end - - private def self.parse(initial_data : Hash(String, JSON::Any)) - body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer", - "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer", - "initialSegments").as_a - - lines = [] of TranscriptLine - body.each do |line| - # Transcript section headers. They are not apart of the captions and as such we can safely skip them. - if line.as_h.has_key?("transcriptSectionHeaderRenderer") - next - end - - line = line["transcriptSegmentRenderer"] - - start_ms = line["startMs"].as_s.to_i.millisecond - end_ms = line["endMs"].as_s.to_i.millisecond - - text = extract_text(line["snippet"]) || "" - - lines << TranscriptLine.new(start_ms, end_ms, text) - end - - return lines - end end end