diff --git a/scripts/lib/vtt-parser.js b/scripts/lib/vtt-parser.js
new file mode 100644
index 0000000..be7e7bd
--- /dev/null
+++ b/scripts/lib/vtt-parser.js
@@ -0,0 +1,126 @@
+/**
+ * Minimal WebVTT cue parser used to turn caption files into speaker-attributed transcripts.
+ */
+
+// WebVTT timestamp: optional hours (two or more digits), then mm:ss.ttt.
+const TIMESTAMP = '(?:\\d{2,}:)?\\d{2}:\\d{2}\\.\\d{3}';
+const TIMING_LINE = new RegExp(`(${TIMESTAMP})\\s+-->\\s+(${TIMESTAMP})`);
+// Opening voice tag at the start of the cue text; ".class" annotations on the tag are allowed.
+const VOICE_OPEN = /^<v(?:\.[^\s>]*)?\s+([^>]+)>([\s\S]*)$/;
+const VOICE_CLOSE = '</v>';
+
+/**
+ * Expand a timestamp written without hours to the full hh:mm:ss.ttt form.
+ *
+ * @param {string} timestamp - Timestamp captured from a cue timing line.
+ * @returns {string} The timestamp, prefixed with "00:" when the hours were omitted.
+ */
+function normalizeTimestamp(timestamp) {
+  return timestamp.split(':').length === 2 ? `00:${timestamp}` : timestamp;
+}
+
+/**
+ * Split joined cue text into speaker and spoken text.
+ *
+ * The cue is attributed when it starts with a <v Name> tag that is either left
+ * unclosed or closed at the very end of the cue.
+ *
+ * @param {string} raw - Cue text with its lines already joined and trimmed.
+ * @returns {{speaker: (string|null), text: string}} Speaker is null when no voice tag applies.
+ */
+function parseCueText(raw) {
+  const voice = raw.match(VOICE_OPEN);
+  if (!voice) return { speaker: null, text: raw };
+
+  let text = voice[2];
+  if (text.endsWith(VOICE_CLOSE)) {
+    text = text.slice(0, -VOICE_CLOSE.length);
+  } else if (text.includes(VOICE_CLOSE)) {
+    // A closing tag followed by more text is not a whole-cue voice span; leave the cue unattributed.
+    return { speaker: null, text: raw };
+  }
+  return { speaker: voice[1].trim(), text: text.trim() };
+}
+
+/**
+ * Fold runs of adjacent cues with the same speaker into one cue each.
+ *
+ * Speakers are compared with ===, so adjacent cues without a speaker (null) are folded too.
+ *
+ * @param {Array<{speaker: (string|null), text: string, start: string, end: string}>} cues
+ * @returns {Array<{speaker: (string|null), text: string, start: string, end: string}>} New cue
+ *   objects; the input cues are not modified.
+ */
+function mergeSameSpeaker(cues) {
+  const merged = [];
+  for (const cue of cues) {
+    const last = merged[merged.length - 1];
+    if (last && last.speaker === cue.speaker) {
+      last.text = `${last.text} ${cue.text}`;
+      last.end = cue.end;
+    } else {
+      // Copy so that extending a merged cue never mutates the caller-visible original.
+      merged.push({ ...cue });
+    }
+  }
+  return merged;
+}
+
+/**
+ * Parse the cues of a WebVTT document.
+ *
+ * Lines that do not belong to a cue with text (header, cue identifiers,
+ * comments, cues without text) are skipped. Multi-line cue text is joined
+ * with single spaces.
+ *
+ * @param {string} vtt - Full text of the WebVTT file (LF or CRLF line endings).
+ * @param {{mergeConsecutive?: boolean}} [options] - Set mergeConsecutive to fold
+ *   adjacent cues of the same speaker into one cue spanning first start to last end.
+ * @returns {Array<{speaker: (string|null), text: string, start: string, end: string}>}
+ *   Cues in document order with timestamps in hh:mm:ss.ttt form.
+ * @throws {TypeError} If vtt is not a string.
+ */
+function parseVtt(vtt, options = {}) {
+  if (typeof vtt !== 'string') {
+    throw new TypeError('parseVtt expects the VTT document as a string');
+  }
+  const { mergeConsecutive = false } = options || {};
+  const lines = vtt.split(/\r?\n/);
+  const cues = [];
+  let i = 0;
+
+  while (i < lines.length) {
+    const timing = lines[i].match(TIMING_LINE);
+    i++;
+    if (!timing) continue;
+
+    // Cue text runs until a blank line, the next timing line, or the end of input.
+    const textLines = [];
+    while (i < lines.length && lines[i].trim() !== '' && !lines[i].includes('-->')) {
+      textLines.push(lines[i]);
+      i++;
+    }
+    const raw = textLines.join(' ').trim();
+    if (!raw) continue;
+
+    cues.push({
+      ...parseCueText(raw),
+      start: normalizeTimestamp(timing[1]),
+      end: normalizeTimestamp(timing[2]),
+    });
+  }
+
+  return mergeConsecutive ? mergeSameSpeaker(cues) : cues;
+}
+
+/**
+ * Render cues as one "Speaker: text" line per cue.
+ *
+ * @param {Array<{speaker: (string|null), text: string}>} cues - Cues as returned by parseVtt.
+ * @returns {string} Lines joined with "\n"; cues without a speaker are labelled "[unknown]".
+ */
+function formatTranscript(cues) {
+  return cues.map(c => `${c.speaker || '[unknown]'}: ${c.text}`).join('\n');
+}
+
+module.exports = { parseVtt, formatTranscript };
diff --git a/scripts/test/vtt-parser.test.js b/scripts/test/vtt-parser.test.js
new file mode 100644
index 0000000..616ce72
--- /dev/null
+++ b/scripts/test/vtt-parser.test.js
@@ -0,0 +1,93 @@
+const { describe, it } = require('node:test');
+const assert = require('node:assert/strict');
+const { parseVtt, formatTranscript } = require('../lib/vtt-parser.js');
+
+// Cue parsing: speaker extraction, multi-line text, merging, and empty input.
+describe('parseVtt', () => {
+  it('parses single cue with speaker tag', () => {
+    const vtt = `WEBVTT
+
+00:00:01.000 --> 00:00:04.000
+<v Christian Kauer>Hallo zusammen.</v>
+`;
+    const result = parseVtt(vtt);
+    assert.deepEqual(result, [
+      { speaker: 'Christian Kauer', text: 'Hallo zusammen.', start: '00:00:01.000', end: '00:00:04.000' }
+    ]);
+  });
+
+  it('joins multi-line cue text with spaces', () => {
+    const vtt = `WEBVTT
+
+00:00:01.000 --> 00:00:04.000
+<v Frank Herberg>Wir können
+das prüfen.</v>
+`;
+    const result = parseVtt(vtt);
+    assert.equal(result.length, 1);
+    assert.equal(result[0].speaker, 'Frank Herberg');
+    assert.equal(result[0].text, 'Wir können das prüfen.');
+  });
+
+  it('merges consecutive cues from same speaker', () => {
+    const vtt = `WEBVTT
+
+00:00:01.000 --> 00:00:04.000
+<v Frank Herberg>Wir können das prüfen.</v>
+
+00:00:04.500 --> 00:00:07.000
+<v Frank Herberg>Lass uns Borgstedt fragen.</v>
+`;
+    const result = parseVtt(vtt, { mergeConsecutive: true });
+    assert.equal(result.length, 1);
+    assert.equal(result[0].speaker, 'Frank Herberg');
+    assert.equal(result[0].text, 'Wir können das prüfen. Lass uns Borgstedt fragen.');
+    // The merged cue spans from the first cue's start to the last cue's end.
+    assert.equal(result[0].start, '00:00:01.000');
+    assert.equal(result[0].end, '00:00:07.000');
+  });
+
+  it('keeps cues from different speakers separate when merging', () => {
+    const vtt = `WEBVTT
+
+00:00:01.000 --> 00:00:04.000
+<v Frank Herberg>Wir können das prüfen.</v>
+
+00:00:04.500 --> 00:00:07.000
+<v Christian Kauer>Hallo zusammen.</v>
+`;
+    const result = parseVtt(vtt, { mergeConsecutive: true });
+    assert.deepEqual(result.map(cue => cue.speaker), ['Frank Herberg', 'Christian Kauer']);
+  });
+
+  it('handles cues without speaker tag', () => {
+    const vtt = `WEBVTT
+
+00:00:01.000 --> 00:00:02.000
+[Hintergrundgeräusch]
+`;
+    const result = parseVtt(vtt);
+    assert.equal(result[0].speaker, null);
+    assert.equal(result[0].text, '[Hintergrundgeräusch]');
+  });
+
+  it('returns empty array for empty vtt', () => {
+    assert.deepEqual(parseVtt('WEBVTT\n\n'), []);
+  });
+});
+
+// Transcript rendering, including the placeholder for unattributed cues.
+describe('formatTranscript', () => {
+  it('formats cues as speaker: text lines', () => {
+    const cues = [
+      { speaker: 'Christian Kauer', text: 'Hallo.' },
+      { speaker: 'Frank Herberg', text: 'Hi.' }
+    ];
+    assert.equal(formatTranscript(cues), 'Christian Kauer: Hallo.\nFrank Herberg: Hi.');
+  });
+
+  it('uses [unknown] for null speaker', () => {
+    const cues = [{ speaker: null, text: '[Lärm]' }];
+    assert.equal(formatTranscript(cues), '[unknown]: [Lärm]');
+  });
+});