diff --git a/src/invidious/routes/api/v1/videos.cr b/src/invidious/routes/api/v1/videos.cr
index 9281f4dd..faff2f59 100644
--- a/src/invidious/routes/api/v1/videos.cr
+++ b/src/invidious/routes/api/v1/videos.cr
@@ -89,9 +89,14 @@ module Invidious::Routes::API::V1::Videos
 
     if CONFIG.use_innertube_for_captions
       params = Invidious::Videos::Transcript.generate_param(id, caption.language_code, caption.auto_generated)
-      initial_data = YoutubeAPI.get_transcript(params)
 
-      webvtt = Invidious::Videos::Transcript.convert_transcripts_to_vtt(initial_data, caption.language_code)
+      transcript = Invidious::Videos::Transcript.from_raw(
+        YoutubeAPI.get_transcript(params),
+        caption.language_code,
+        caption.auto_generated
+      )
+
+      webvtt = transcript.to_vtt
     else
       # Timedtext API handling
       url = URI.parse("#{caption.base_url}&tlang=#{tlang}").request_target
diff --git a/src/invidious/videos/transcript.cr b/src/invidious/videos/transcript.cr
index dac00eea..9cd064c5 100644
--- a/src/invidious/videos/transcript.cr
+++ b/src/invidious/videos/transcript.cr
@@ -1,8 +1,26 @@
 module Invidious::Videos
-  # Namespace for methods primarily relating to Transcripts
-  module Transcript
-    record TranscriptLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+  # A `Transcript` struct encapsulates a sequence of lines that together form the whole transcript for a given YouTube video.
+  # These lines can be categorized into two types: section headings and regular lines representing content from the video.
+  struct Transcript
+    # Types
+    record HeadingLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+    record RegularLine, start_ms : Time::Span, end_ms : Time::Span, line : String
+    alias TranscriptLine = HeadingLine | RegularLine
 
+    property lines : Array(TranscriptLine)
+
+    property language_code : String
+    property auto_generated : Bool
+
+    # User friendly label for the current transcript.
+    # Example: "English (auto-generated)"
+    property label : String
+
+    # Initializes a new Transcript struct with the contents and associated metadata describing it
+    def initialize(@lines : Array(TranscriptLine), @language_code : String, @auto_generated : Bool, @label : String)
+    end
+
+    # Generates a protobuf string to fetch the requested transcript from YouTube
     def self.generate_param(video_id : String, language_code : String, auto_generated : Bool) : String
       kind = auto_generated ? "asr" : ""
 
@@ -30,48 +48,79 @@ module Invidious::Videos
       return params
     end
 
-    def self.convert_transcripts_to_vtt(initial_data : Hash(String, JSON::Any), target_language : String) : String
-      # Convert into array of TranscriptLine
-      lines = self.parse(initial_data)
+    # Constructs a Transcript struct from the initial YouTube response
+    def self.from_raw(initial_data : Hash(String, JSON::Any), language_code : String, auto_generated : Bool)
+      transcript_panel = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
+        "content", "transcriptSearchPanelRenderer")
 
+      segment_list = transcript_panel.dig("body", "transcriptSegmentListRenderer")
+
+      if !segment_list["initialSegments"]?
+        raise NotFoundException.new("Requested transcript does not exist")
+      end
+
+      # Extract user-friendly label for the current transcript
+
+      footer_language_menu = transcript_panel.dig?(
+        "footer", "transcriptFooterRenderer", "languageMenu", "sortFilterSubMenuRenderer", "subMenuItems"
+      )
+
+      if footer_language_menu
+        label = footer_language_menu.as_a.select(&.["selected"].as_bool)[0]["title"].as_s
+      else
+        label = language_code
+      end
+
+      # Extract transcript lines
+
+      initial_segments = segment_list["initialSegments"].as_a
+
+      lines = [] of TranscriptLine
+
+      initial_segments.each do |line|
+        if unpacked_line = line["transcriptSectionHeaderRenderer"]?
+          line_type = HeadingLine
+        else
+          unpacked_line = line["transcriptSegmentRenderer"]
+          line_type = RegularLine
+        end
+
+        start_ms = unpacked_line["startMs"].as_s.to_i.millisecond
+        end_ms = unpacked_line["endMs"].as_s.to_i.millisecond
+        text = extract_text(unpacked_line["snippet"]) || ""
+
+        lines << line_type.new(start_ms, end_ms, text)
+      end
+
+      return Transcript.new(
+        lines: lines,
+        language_code: language_code,
+        auto_generated: auto_generated,
+        label: label
+      )
+    end
+
+    # Converts transcript lines to a WebVTT file
+    #
+    # This is used within Invidious to replace subtitles
+    # so as to work around YouTube's rate-limited timedtext endpoint.
+    def to_vtt
       settings_field = {
         "Kind"     => "captions",
-        "Language" => target_language,
+        "Language" => @language_code,
       }
 
-      # Taken from Invidious::Videos::Captions::Metadata.timedtext_to_vtt()
       vtt = WebVTT.build(settings_field) do |vtt|
-        lines.each do |line|
+        @lines.each do |line|
+          # Section headers are excluded from the VTT conversion so as to
+          # match the regular captions returned from YouTube as much as possible
+          next if line.is_a? HeadingLine
+
           vtt.cue(line.start_ms, line.end_ms, line.line)
         end
       end
 
       return vtt
     end
-
-    private def self.parse(initial_data : Hash(String, JSON::Any))
-      body = initial_data.dig("actions", 0, "updateEngagementPanelAction", "content", "transcriptRenderer",
-        "content", "transcriptSearchPanelRenderer", "body", "transcriptSegmentListRenderer",
-        "initialSegments").as_a
-
-      lines = [] of TranscriptLine
-      body.each do |line|
-        # Transcript section headers. They are not apart of the captions and as such we can safely skip them.
-        if line.as_h.has_key?("transcriptSectionHeaderRenderer")
-          next
-        end
-
-        line = line["transcriptSegmentRenderer"]
-
-        start_ms = line["startMs"].as_s.to_i.millisecond
-        end_ms = line["endMs"].as_s.to_i.millisecond
-
-        text = extract_text(line["snippet"]) || ""
-
-        lines << TranscriptLine.new(start_ms, end_ms, text)
-      end
-
-      return lines
-    end
   end
 end