1#!/usr/local/bin/perl -w
2
3# A script for pipelined editing of subtitle files.
4# Copyright (C) 2004 Michael Klepikov <mike72@mail.ru>
5#
6# Version 1.0  initial release  28-Mar-04
7#
# Comments, suggestions -- send me an e-mail, but the recommended way is
# to enhance/fix on your own and submit to the distribution ;)
# If you like, I can review the fixes.
11#
12# This script is free software; you can redistribute it and/or
13# modify it under the terms of the GNU Lesser General Public
14# License as published by the Free Software Foundation; either
15# version 2.1 of the License, or (at your option) any later version.
16# Retain original credits when modifying.
17#
18# This script is distributed in the hope that it will be useful,
19# but WITHOUT ANY WARRANTY; without even the implied warranty of
20# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
21# Lesser General Public License for more details.
22#
23# You should have received a copy of the GNU Lesser General Public
24# License along with this library; if not, write to the Free Software
25# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
26#
27
28use Math::BigInt;
29
# Constants: identifiers of the supported subtitle formats
my $FMT_UNKNOWN = 0;
my $FMT_SRT = 1;

# Argument values; undef means that the switch was not given
my $DEBUG = 0;
my $inFormat;
my $outFormat;
my $shiftMilli;
my $scaleMilli;
my $splitFromMilli;
my $splitToMilli;

## Process command line
# A switch that takes a value consumes the next argument unconditionally,
# which is what allows negative times such as "-1:15:0" to be passed.
# All time values are converted to milliseconds by getTimeMillis.
while (defined (my $argVal = shift)) {
  if ($argVal eq "-d" || $argVal eq "--debug") {
    $DEBUG = 1;
  } elsif ($argVal eq "-if" || $argVal eq "--input-format") {
    $inFormat = shift;
    usage ("Must specify input format") if ! $inFormat;
    if ($inFormat =~ /^srt/i) {
      $inFormat = $FMT_SRT;
    } else {
      usage ("Invalid input format");
    }
  } elsif ($argVal eq "-of" || $argVal eq "--output-format") {
    $outFormat = shift;
    usage ("Must specify output format") if ! $outFormat;
    if ($outFormat =~ /^srt/i) {
      $outFormat = $FMT_SRT;
    } else {
      usage ("Invalid output format");
    }
  } elsif ($argVal eq "-s" || $argVal eq "--shift") {
    my $argTime = shift;
    if (! defined $argTime ||
        ! defined ($shiftMilli = getTimeMillis ($argTime))) {
      usage ("Invalid shift time value");
    }
  } elsif ($argVal eq "-c" || $argVal eq "--scale") {
    my $argTime = shift;
    if (! defined $argTime ||
        ! defined ($scaleMilli = getTimeMillis ($argTime))) {
      usage ("Invalid scale time value");
    }
  } elsif ($argVal eq "-f" || $argVal eq "--split-from") {
    my $argTime = shift;
    if (! defined $argTime ||
        ! defined ($splitFromMilli = getTimeMillis ($argTime))) {
      usage ("Invalid split start time value");
    }
  } elsif ($argVal eq "-t" || $argVal eq "--split-to") {
    my $argTime = shift;
    if (! defined $argTime ||
        ! defined ($splitToMilli = getTimeMillis ($argTime))) {
      usage ("Invalid split end time value");
    }
  } elsif ($argVal eq "-h" || $argVal eq "--help") {
    usage ();
  } else {
    usage ("Unrecognized argument $argVal");
  }
}

# Input format defaults to SRT
$inFormat = $FMT_SRT if (! defined $inFormat);
# Output format defaults to the same as input
$outFormat = $inFormat if (! defined $outFormat);
98
## Read

my $subs;
if ($inFormat == $FMT_SRT) {
  $subs = readSRT (*STDIN);
  printf STDERR ("Read %d SRT subs\n", scalar @{$subs}) if $DEBUG;
  # Sort by start time, so that concatenated inputs come out in
  # chronological order. Both sides of the comparison must look at the
  # start time; comparing a start against an end gives an inconsistent
  # ordering.
  @{$subs} = sort {$a -> {srtStartTime} <=> $b -> {srtStartTime}} @{$subs};
}
108
## Transform
# The requested operations are applied in a fixed order:
# shift first, then split, then scale.

# Shift: nothing to do for a missing or zero offset
if (defined $shiftMilli and $shiftMilli != 0) {
  printf STDERR ("Shift: %d milliseconds\n", $shiftMilli) if $DEBUG;
  shiftSRT ($subs, $shiftMilli);
}

# Split: either boundary alone is enough to request it
if (defined $splitFromMilli or defined $splitToMilli) {
  if ($DEBUG) {
    # A missing boundary is shown as "-"
    my ($fromLabel, $toLabel) =
      map { defined $_ ? $_ : "-" } ($splitFromMilli, $splitToMilli);
    printf STDERR ("Split: from $fromLabel to $toLabel\n");
  }
  splitSRT ($subs, $splitFromMilli, $splitToMilli);
}

# Scale: stretch all timings so that the end of the last subtitle moves
# by $scaleMilli. The ratio is passed as new end time / old end time.
if (defined $scaleMilli and $scaleMilli != 0) {
  if (@{$subs}) {
    my $lastTimeOrig = $subs->[-1]{srtEndTime};
    # The old end time is the divisor of the ratio, so it must not be zero
    die "Cannot scale when last subtitle ends at 00:00:00,000"
      if $lastTimeOrig == 0;
    my $lastTimeScaled = $lastTimeOrig + $scaleMilli;
    printf STDERR ("Scale: %d/%d\n", $lastTimeScaled, $lastTimeOrig) if $DEBUG;
    scaleSRT ($subs, $lastTimeScaled, $lastTimeOrig);
  }
}
137
## Write
# SRT is the only output format handled here
writeSRT (*STDOUT, $subs) if $outFormat == $FMT_SRT;

# Close STDOUT explicitly, as recommended by the Perl manual: buffered
# write errors (disk full, etc.) are only reported when the stream is closed.
close (STDOUT) or die "Cannot close output stream: $!";

exit 0;
148
149## Subroutines
150
# Convert a time string to milliseconds.
#
# Accepted format: [-][[hh:]mm:]ss[.fraction]
# SRT style is "01:20:03,251"; both "," and "." are accepted as the
# fraction separator. Fields may have any number of digits, the seconds
# field may be empty (".5", "1:"), and surrounding whitespace is ignored.
# Fraction digits beyond the third (sub-millisecond) are dropped.
#
# Returns an integer number of milliseconds, negative if the string starts
# with "-". Returns undef in case of format error, which includes strings
# that contain no digit at all.
sub getTimeMillis
{
  # Work on a lexical copy: callers may keep their own data in the global $_
  my $text = shift;
  return undef if ! defined $text;

  # Strip surrounding spaces
  $text =~ s/^\s+//;
  $text =~ s/\s+$//;

  # A time needs at least one digit; "" and "-" are format errors
  return undef if $text !~ /[0-9]/;

  # Minus sign
  my $negative = ($text =~ s/^-//);

  # Fraction: handled as digits rather than as a floating point number,
  # so that the result is always an exact integer
  my $fracMilli = 0;
  if ($text =~ s/[.,]([0-9]*)$//) {
    $fracMilli = substr ($1 . "000", 0, 3) + 0;
  }

  # What is left is [[hh:]mm:]ss; limit -1 keeps an empty seconds field
  my @fields = split (/:/, $text, -1);
  return undef if @fields > 3;

  # Seconds
  my $secText = @fields ? pop @fields : "";
  return undef if $secText !~ /^[0-9]*$/;
  my $totalSecs = ($secText eq "") ? 0 : $secText + 0;

  # Minutes, then hours
  my $unitSecs = 60;
  foreach my $field (reverse @fields) {
    return undef if $field !~ /^[0-9]+$/;
    $totalSecs += $field * $unitSecs;
    $unitSecs *= 60;
  }

  my $millis = $totalSecs * 1000 + $fracMilli;
  $millis = -$millis if $negative && $millis != 0;
  return $millis;
}
193
# Format a number of milliseconds as an SRT time string, "hh:mm:ss,mmm".
# A negative value is formatted from its absolute value with a leading "-".
# The hours field grows beyond two digits when needed.
sub getTimeSRT
{
  my $millis = shift;
  my $sign = ($millis < 0) ? "-" : "";
  my $rest = int (abs ($millis));

  # Peel off milliseconds, seconds and minutes in turn;
  # whatever remains afterwards is the number of hours
  my @fields;
  foreach my $base (1000, 60, 60) {
    unshift @fields, $rest % $base;
    $rest = int ($rest / $base);
  }

  return sprintf ("%s%02d:%02d:%02d,%03d", $sign, $rest, @fields);
}
213
# Read SRT subtitles from a file handle.
#
# Takes the input handle (e.g. *STDIN). Returns a reference to an array
# with one hash per subtitle, in input order:
#   srtNumber     the subtitle number as found in the input
#   srtStartTime  start time, as returned by getTimeMillis
#   srtEndTime    end time, as returned by getTimeMillis
#   lines         array ref of text lines, trailing whitespace stripped
# Dies with the offending line number on malformed input.
sub readSRT
{
  my $in = shift;
  my $subs = [];

  # The current line lives in a lexical, so that nothing called from here
  # can overwrite it. End of file is always tested with "defined":
  # a last line "0" without a newline is false, but it is still a line.
  my $line = <$in>;
  print STDERR "Undefined first line\n" if ! defined $line && $DEBUG;
  my $lineNo = 1;

  # Drop a byte order mark (raw UTF-8 bytes or decoded character),
  # which would otherwise hide the first subtitle number
  $line =~ s/^(?:\xEF\xBB\xBF|\x{FEFF})// if defined $line;

  READ_SUBS:
  while (defined $line) {
    # Each loop iteration reads one subtitle
    my $sub = {};

    # Skip empty lines
    while ($line =~ /^\s*$/) {
      $line = <$in>;
      last READ_SUBS if ! defined $line;
      ++$lineNo;
    }

    # Subtitle number
    if ($line =~ /^\s*([0-9]+)\s*$/) {
      $sub -> {srtNumber} = $1;
    } else {
      die "Invalid SRT format at line $lineNo";
    }

    # Timing
    $line = <$in>;
    die "Unexpected end of SRT stream at line $lineNo" if ! defined $line;
    ++$lineNo;
    if ($line =~ /^\s*(\S+)\s*--\>\s*(\S+)\s*$/) {
      # Copy the captures before calling anything that runs its own matches
      my ($startText, $endText) = ($1, $2);
      my $startMillis = getTimeMillis ($startText);
      my $endMillis = getTimeMillis ($endText);
      die "Invalid SRT timing format at line $lineNo: $line"
        if ! defined $startMillis || ! defined $endMillis;
      $sub -> {srtStartTime} = $startMillis;
      $sub -> {srtEndTime} = $endMillis;
    } else {
      die "Invalid SRT timing format at line $lineNo: $line";
    }

    # Text lines, up to an empty line or the end of the stream
    my $subLines = [];
    while (defined ($line = <$in>)) {
      ++$lineNo;
      last if $line =~ /^\s*$/; # Empty line ends subtitle
      (my $text = $line) =~ s/\s+$//; # Strip trailing spaces
      push @{$subLines}, $text;
    }
    die "No text in SRT subtitle at line $lineNo" if 0 == scalar @{$subLines};
    $sub -> {lines} = $subLines;

    # Append subtitle to the list
    push @{$subs}, $sub;
  }
  print STDERR "SRT read ok, $lineNo lines\n" if $DEBUG;

  return $subs;
}
281
# Write subtitles in SRT format.
#
# Takes the output handle (e.g. *STDOUT) and a reference to an array of
# subtitle hashes (srtStartTime, srtEndTime, lines). Subtitles are numbered
# sequentially from 1 in array order; a number stored in the subtitle
# itself is not used.
sub writeSRT
{
  my $out = shift;
  my $subs = shift;

  my $written = 0;
  for my $idx (0 .. $#{$subs}) {
    my $sub = $subs->[$idx];
    $written = $idx + 1;

    # Header: sequence number and timing line
    my $fromText = getTimeSRT ($sub->{srtStartTime});
    my $toText = getTimeSRT ($sub->{srtEndTime});
    printf {$out} ("%d\n%s --> %s\n", $written, $fromText, $toText);

    # Text lines, then the empty line that terminates the subtitle
    for my $text (@{$sub->{lines}}) {
      printf {$out} ("%s\n", $text);
    }
    printf {$out} "\n";
  }
  printf STDERR ("Wrote %d SRT subs\n", $written) if $DEBUG;
}
304
# Shift SRT subtitles by a given number of milliseconds.
#
# Takes a reference to the array of subtitle hashes and the offset.
# The offset may be negative. Start and end times are updated in place;
# the addition is done on integers, so fractional parts are truncated.
sub shiftSRT
{
  use integer; # the offset could be passed as a float, truncate
  my ($subs, $shiftMilli) = @_;

  for my $sub (@{$subs}) {
    for my $key (qw(srtStartTime srtEndTime)) {
      $sub->{$key} += $shiftMilli;
    }
  }
}
318
# Multiply each subtitle timing by a dividend and divide by a divisor.
# The idea is that the dividend is usually the new total number of
# milliseconds in the subtitle file, and the divisor is the old
# total number of milliseconds in the subtitle file.
# We could simply use a double precision real coefficient instead of
# integer dividend and divisor, and that could be good enough, but
# using integer arithmetics *guarantees* precision up to the last
# digit, so why settle for good enough when we can have a guarantee.
#
# Uses Math::BigInt arithmetics, because it works with numbers
# up to (total number of milliseconds for a subtitle timing)^2,
# which could be on the order of approximately 1e+13, which is
# larger than maximum 32-bit integer.
# There is a performance loss when using BigInt vs. regular floating
# point arithmetics, but the actual performance is quite acceptable
# on files with a few thousand subtitles.
#
# Takes a reference to the array of subtitle hashes, the dividend and
# the divisor. Start and end times are replaced in place by plain decimal
# integer strings; the division is the integer division performed by
# Math::BigInt::bdiv. Dies if the divisor is zero.
sub scaleSRT
{
  my $subs = shift;
  # Dividend and divisor could be passed as floats, truncate explicitly:
  # Math::BigInt turns a fractional operand into NaN instead of truncating
  my $scaleDividend = int (shift);
  my $scaleDivisor = int (shift);

  die "Cannot scale with a zero divisor" if $scaleDivisor == 0;

  foreach my $sub (@{$subs}) {
    foreach my $key ("srtStartTime", "srtEndTime") {
      my $time = Math::BigInt -> new (int ($sub -> {$key}));
      # Calling bdiv in the middle of a method chain puts it in scalar
      # context, so it returns the quotient only
      $sub -> {$key} =
        $time -> bmul ($scaleDividend) -> bdiv ($scaleDivisor) -> bstr ();
    }
  }
}
351
# Extract a fragment within a given time interval.
#
# Takes a reference to the array of subtitle hashes, the start boundary
# and the end boundary, both in milliseconds. Either boundary may be
# undefined, which means "no limit" on that side.
# Subtitles that end before the start boundary or begin after the end
# boundary are removed from the array; a subtitle that overlaps a boundary
# is kept with its timing truncated to that boundary. A subtitle that just
# touches a boundary is kept. The array is modified in place.
sub splitSRT
{
  use integer; # times could be floats, compare them as integers
  my $subs = shift;
  my $fromMilli = shift;
  my $toMilli = shift;

  # Boundaries could be passed as floats: truncate them once, so that
  # the values stored into truncated timings are integers as well
  $fromMilli = int ($fromMilli) if defined $fromMilli;
  $toMilli = int ($toMilli) if defined $toMilli;

  # One pass collecting the survivors, instead of removing the rejected
  # subtitles from the middle of the array one at a time
  my @kept;
  foreach my $sub (@{$subs}) {
    # The subtitle must end no earlier than the start boundary
    next if defined $fromMilli && $sub -> {srtEndTime} < $fromMilli;

    # Start time as it will be after fixing an overlapping start
    my $clampStart =
      defined $fromMilli && $sub -> {srtStartTime} < $fromMilli;
    my $start = $clampStart ? $fromMilli : $sub -> {srtStartTime};

    # The subtitle must begin no later than the end boundary
    next if defined $toMilli && $start > $toMilli;

    # All conditions met, do the fixes
    $sub -> {srtStartTime} = $start if $clampStart;
    if (defined $toMilli && $sub -> {srtEndTime} > $toMilli) {
      $sub -> {srtEndTime} = $toMilli;
    }
    push @kept, $sub;
  }
  @{$subs} = @kept;
}
393
# Print brief usage help to STDERR and exit.
# Accepts an optional error message, e.g. for errors parsing command line.
# With a message: prints it before the help text and exits with code 2.
# Without a message: exits with code 0.
# The help text lists only the switches that the command line parser
# accepts; there is no renumbering switch, because SRT subtitles are
# always renumbered on output.
sub usage
{
  my $msg = shift;
  my $exitCode = 0;

  if (defined $msg) {
    $exitCode = 2;
    print STDERR "$msg\n";
  }

  print STDERR <<USAGE;
Usage: $0 [switches]
  -if,--input-format <fmt>  input format; supported: SRT
                            default is SRT
  -of,--output-format <fmt> output format; supported: SRT
                            default is same as input format
  -s,--shift <time>         shift all subtitles by <time>
                            (format: [-]hh:mm:ss,fraction)
  -c,--scale <time>         scale by adding <time> to overall duration
  -f,--split-from <time>    Drop subtitles that end before <time>
  -t,--split-to <time>      Drop subtitles that start after <time>
                            (will truncate timing if it overlaps a boundary)
  -d,--debug                enable debug output
  -h,--help                 this help message

All times could be negative. Input/output may also contain negative timings,
which is sometimes useful for intermediate results.
SRT subtitles are always renumbered on output.

EXAMPLES

Split subtitle file into two disks at a boundary of one hour 15 minutes:

  subedit.pl --split-to 1:15:0 < all.srt > p1.srt
  subedit.pl -f 1:15:0 < all.srt | subedit.pl --shift -1:15:0 > p2.srt

Join the previous two disks back into one file:

  subedit.pl -s 1:15:00 < p2.srt | cat p1.srt - | subedit.pl > all.srt

Correct a situation where the first subtitle starts in sync with the video,
but the last one starts 3.5 seconds earlier than the speech in the video,
assuming the first subtitle timing is 00:01:05.030:

  subedit.pl -s -1:5.03 | subedit.pl -c 3.5 | subedit.pl -s 1:5.03
USAGE

  exit $exitCode;
}
446