#!/usr/local/bin/perl -w

# A script for pipelined editing of subtitle files.
# Copyright (C) 2004 Michael Klepikov <mike72@mail.ru>
#
# Version 1.0  initial release  28-Mar-04
#
# Comments, suggestions -- send me an e-mail, but the recommended way is
# to enhance/fix on your own and submit to the distribution;)
# If you like, I can review the fixes.
#
# This script is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
# Retain original credits when modifying.
#
# This script is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with this library; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
#

use strict;
use warnings;

use Math::BigInt;

# Constants
my $FMT_UNKNOWN = 0;
my $FMT_SRT     = 1;

# Debug flag; set by -d/--debug and read by the subroutines below
my $DEBUG = 0;

# Run the command-line program only when this file is executed as a
# script, so that the subroutines can also be loaded (e.g. with
# "require") without touching STDIN/STDOUT.  Falling off the end of
# the file afterwards gives the same exit status 0 as before.
main (@ARGV) unless caller;

## Subroutines

# Program entry point: parse the command line, read subtitles from
# STDIN, apply the requested transformations and write the result
# to STDOUT.
#   @_ : the command line arguments
# Returns 0 on success.  Dies on malformed input; invalid arguments
# end the program through usage().
sub main
{
    my @args = @_;

    # Argument values
    my $inFormat;
    my $outFormat;
    my $shiftMilli;
    my $scaleMilli;
    my $splitFromMilli;
    my $splitToMilli;

    ## Process command line
    while (defined (my $argVal = shift @args)) {
        if ($argVal eq "-d" || $argVal eq "--debug") {
            $DEBUG = 1;
        } elsif ($argVal eq "-if" || $argVal eq "--input-format") {
            $inFormat = parseFormatArg (shift @args, "input");
        } elsif ($argVal eq "-of" || $argVal eq "--output-format") {
            $outFormat = parseFormatArg (shift @args, "output");
        } elsif ($argVal eq "-s" || $argVal eq "--shift") {
            $shiftMilli = parseTimeArg (shift @args, "shift");
        } elsif ($argVal eq "-c" || $argVal eq "--scale") {
            $scaleMilli = parseTimeArg (shift @args, "scale");
        } elsif ($argVal eq "-f" || $argVal eq "--split-from") {
            $splitFromMilli = parseTimeArg (shift @args, "split start");
        } elsif ($argVal eq "-t" || $argVal eq "--split-to") {
            $splitToMilli = parseTimeArg (shift @args, "split end");
        } elsif ($argVal eq "-r" || $argVal eq "--renumber") {
            # Documented in the usage text.  Output is always renumbered
            # by writeSRT, so there is nothing to switch on.
        } elsif ($argVal eq "-h" || $argVal eq "--help") {
            usage ();
        } else {
            usage ("Unrecognized argument $argVal");
        }
    }

    # Input format defaults to SRT
    $inFormat = $FMT_SRT if ! defined $inFormat;
    # Output format defaults to the same as input
    $outFormat = $inFormat if ! defined $outFormat;

    ## Read

    my $subs = [];
    if ($inFormat == $FMT_SRT) {
        $subs = readSRT (\*STDIN);
        printf STDERR ("Read %d SRT subs\n", scalar @{$subs}) if $DEBUG;
        sortSRT ($subs);
    }

    ## Transform

    if (defined $shiftMilli && 0 != $shiftMilli) {
        printf STDERR ("Shift: %d milliseconds\n", $shiftMilli) if $DEBUG;
        shiftSRT ($subs, $shiftMilli);
    }

    if (defined $splitFromMilli || defined $splitToMilli) {
        if ($DEBUG) {
            my $printFrom = (defined $splitFromMilli) ? $splitFromMilli : "-";
            my $printTo = (defined $splitToMilli) ? $splitToMilli : "-";
            print STDERR "Split: from $printFrom to $printTo\n";
        }
        splitSRT ($subs, $splitFromMilli, $splitToMilli);
    }

    if (defined $scaleMilli && 0 != $scaleMilli) {
        my $lastSubIdx = scalar @{$subs} - 1;
        if ($lastSubIdx >= 0) {
            # The scale factor is (old duration + delta) / (old duration),
            # where the duration is the end time of the last subtitle.
            my $lastTimeOrig = $subs -> [$lastSubIdx] -> {srtEndTime};
            if ($lastTimeOrig == 0) {
                die "Cannot scale when last subtitle ends at 00:00:00,000";
            }
            my $lastTimeScaled = $lastTimeOrig + $scaleMilli;
            printf STDERR ("Scale: %d/%d\n", $lastTimeScaled, $lastTimeOrig)
                if $DEBUG;
            scaleSRT ($subs, $lastTimeScaled, $lastTimeOrig);
        }
    }

    ## Write
    if ($outFormat == $FMT_SRT) {
        writeSRT (\*STDOUT, $subs);
    }

    # Close STDOUT, as recommended by Perl manual
    # (allows diagnostics on disc overflow, etc.)
    close (STDOUT) or die "Cannot close output stream: $!";

    return 0;
}

# Validate the value of a format switch.
#   $name : the value given on the command line (may be undef)
#   $kind : "input" or "output", used in the error messages
# Returns the format constant; ends the program through usage() when
# the value is missing or is not a supported format.
sub parseFormatArg
{
    my $name = shift;
    my $kind = shift;

    usage ("Must specify $kind format") if ! $name;
    usage ("Invalid $kind format") if $name !~ /^srt/i;
    return $FMT_SRT;
}

# Validate the value of a time switch.
#   $argTime : the value given on the command line (may be undef)
#   $label   : what the time is for, used in the error message
# Returns the time in milliseconds; ends the program through usage()
# when the value is missing or malformed.
sub parseTimeArg
{
    my $argTime = shift;
    my $label = shift;

    my $millis = defined $argTime ? getTimeMillis ($argTime) : undef;
    usage ("Invalid $label time value") if ! defined $millis;
    return $millis;
}

# Convert string time format to milliseconds
# SRT style: "01:20:03.251", and "," is allowed instead of "."
# Accepted form: [-][[hours:]minutes:][seconds][(.|,)[fraction]],
# optionally surrounded by white space.
# The result is always an integer: fraction digits beyond the third
# are dropped, and no floating point arithmetic is involved.
# Return undef in case of format error
sub getTimeMillis
{
    my $text = shift;

    return if ! defined $text;
    return if $text !~ m{
        \A \s*
        (-)?                        # optional minus sign
        (?:
            (?: ([0-9]+) : )?       # optional hours
            ([0-9]+) :              # minutes
        )?
        ([0-9]+)?                   # optional seconds
        (?: [.,] ([0-9]+)? )?       # optional fraction of a second
        \s* \z
    }x;
    my ($minus, $hours, $minutes, $seconds, $fraction) = ($1, $2, $3, $4, $5);

    my $millis = 0;
    $millis += $hours * 3600000 if defined $hours;
    $millis += $minutes * 60000 if defined $minutes;
    $millis += $seconds * 1000 if defined $seconds;
    # Pad/cut the fraction to exactly three digits: "5" -> 500, "03" -> 30
    $millis += substr ($fraction . "000", 0, 3) if defined $fraction;
    $millis = -$millis if defined $minus && $millis != 0;

    return $millis;
}

# Convert milliseconds to SRT formatted string, "hh:mm:ss,mmm".
# Negative values get a leading "-"; a fractional part is dropped.
sub getTimeSRT
{
    my $t = shift;
    my $tMinus = "";
    if ($t < 0) {
        $t = -$t;
        $tMinus = "-";
    }
    $t = int ($t);
    my $tMilli = $t % 1000;
    $t = int ($t / 1000);
    my $tSec = $t % 60;
    $t = int ($t / 60);
    my $tMin = $t % 60;
    my $tHr = int ($t / 60);
    return sprintf ("%s%02d:%02d:%02d,%03d",
                    $tMinus, $tHr, $tMin, $tSec, $tMilli);
}

# Read SRT subtitles
#   $in : input file handle (a glob or a reference to one)
# Returns a reference to an array of subtitles; each subtitle is a
# hash with the keys srtNumber, srtStartTime and srtEndTime (both in
# milliseconds) and lines (reference to an array of text lines with
# trailing white space removed).
# Dies with the line number on malformed input.
sub readSRT
{
    my $in = shift;
    my $subs = [];

    my $line = <$in>;
    print STDERR "Undefined first line\n" if ! defined $line && $DEBUG;
    # A byte order mark in front of the first subtitle number would
    # otherwise be reported as invalid SRT format
    $line =~ s/^(?:\xEF\xBB\xBF|\x{FEFF})// if defined $line;
    my $lineNo = 1;

    READ_SUBS:
    while (defined $line) {
        # Each loop iteration reads one subtitle from the input
        my $sub = {};

        # Skip empty lines
        while ($line =~ /^\s*$/) {
            $line = <$in>;
            last READ_SUBS if ! defined $line;
            ++$lineNo;
        }

        # Subtitle number
        if ($line =~ /^\s*([0-9]+)\s*$/) {
            $sub -> {srtNumber} = $1;
        } else {
            die "Invalid SRT format at line $lineNo";
        }

        # Timing
        $line = <$in>;
        die "Unexpected end of SRT stream at line $lineNo"
            if ! defined $line;
        ++$lineNo;
        if ($line =~ /^\s*(\S+)\s*--\>\s*(\S+)\s*$/) {
            my ($startText, $endText) = ($1, $2);
            my $startMillis = getTimeMillis ($startText);
            my $endMillis = getTimeMillis ($endText);
            die "Invalid SRT timing format at line $lineNo: $line"
                if ! defined $startMillis || ! defined $endMillis;
            $sub -> {srtStartTime} = $startMillis;
            $sub -> {srtEndTime} = $endMillis;
        } else {
            die "Invalid SRT timing format at line $lineNo: $line";
        }

        # Text lines; EOF or an empty line ends the subtitle.
        # On EOF $line is left undefined, which also ends the outer loop.
        my $subLines = [];
        while (defined ($line = <$in>)) {
            ++$lineNo;
            last if $line =~ /^\s*$/;
            $line =~ s/\s+$//;          # Strip trailing spaces
            push @{$subLines}, $line;
        }
        die "No text in SRT subtitle at line $lineNo"
            if 0 == scalar @{$subLines};
        $sub -> {lines} = $subLines;

        # Append subtitle to the list
        push @{$subs}, $sub;
    }
    print STDERR "SRT read ok, $lineNo lines\n" if $DEBUG;

    return $subs;
}

# Sort subtitles in place by start time
#   $subs : reference to the array of subtitles
sub sortSRT
{
    my $subs = shift;

    @{$subs} = sort {$a -> {srtStartTime} <=> $b -> {srtStartTime}} @{$subs};
    return;
}

# Write SRT subtitles
#   $out  : output file handle (a glob or a reference to one)
#   $subs : reference to the array of subtitles
# The subtitles are numbered from 1 in array order; the srtNumber
# read from the input is not used.
sub writeSRT
{
    my $out = shift;
    my $subs = shift;

    my $subNum = 0;
    foreach my $sub (@{$subs}) {
        ++$subNum;

        my $sTimeSRT = getTimeSRT ($sub -> {srtStartTime});
        my $eTimeSRT = getTimeSRT ($sub -> {srtEndTime});
        printf {$out} ("%d\n%s --> %s\n", $subNum, $sTimeSRT, $eTimeSRT);
        foreach my $text (@{$sub -> {lines}}) {
            print {$out} "$text\n";
        }
        print {$out} "\n";
    }
    printf STDERR ("Wrote %d SRT subs\n", $subNum) if $DEBUG;
    return;
}

# Shift SRT subtitles in place by a given number of milliseconds.
# The number may be negative; a fractional part is dropped.
sub shiftSRT
{
    use integer; # $shiftMilli could be passed as float
    my $subs = shift;
    my $shiftMilli = shift;

    foreach my $sub (@{$subs}) {
        $sub -> {srtStartTime} += $shiftMilli;
        $sub -> {srtEndTime} += $shiftMilli;
    }
    return;
}

# Multiply each subtitle timing by a dividend and divide by divisor.
# The idea is that the dividend is usually the new total number of
# milliseconds in the subtitle file, and the divisor is the old
# total number of milliseconds in the subtitle file.
# We could simply use a double precision real coefficient instead of
# integer dividend and divisor, and that could be good enough, but
# using integer arithmetics *guarantees* precision up to the last
# digit, so why settle for good enough when we can have a guarantee.
#
# Uses Math::BigInt arithmetics, because it works with numbers
# up to (total number of milliseconds for a subtitle timing)^2,
# which could be on the order of approximately 1e+13, which is
# larger than maximum 32-bit integer.
# There is a performance loss when using BigInt vs. regular floating
# point arithmetics, but the actual performance is quite acceptable
# on files with a few thousand subtitles.
#
# The timings are modified in place and stored back as plain numbers.
# Dies when the divisor is zero.
sub scaleSRT
{
    my $subs = shift;
    # Dividend and divisor could be passed as floats, truncate:
    # Math::BigInt only works with integer operands
    my $scaleDividend = int (shift);
    my $scaleDivisor = int (shift);

    die "Cannot scale by a zero divisor" if $scaleDivisor == 0;

    foreach my $sub (@{$subs}) {
        foreach my $key (qw(srtStartTime srtEndTime)) {
            my $scaled = Math::BigInt -> new (int ($sub -> {$key}));
            $scaled = $scaled -> bmul ($scaleDividend);
            # Scalar assignment, so that bdiv returns the quotient only
            $scaled = $scaled -> bdiv ($scaleDivisor);
            $sub -> {$key} = $scaled -> numify ();
        }
    }
    return;
}

# Extract a fragment within a given time interval
# Either "from" or "to" may be undefined
#   $subs      : reference to the array of subtitles, modified in place
#   $fromMilli : drop subtitles that end before this time
#   $toMilli   : drop subtitles that start after this time
# Timings of the remaining subtitles are cut to fit into the interval.
sub splitSRT
{
    use integer; # fromMilli and toMilli could be passed as floats, truncate
    my $subs = shift;
    my $fromMilli = shift;
    my $toMilli = shift;

    my @kept;
    foreach my $sub (@{$subs}) {
        if (defined $fromMilli) {
            # The subtitle must end later than the start boundary
            next if $sub -> {srtEndTime} < $fromMilli;

            # Fix overlapping start timing
            $sub -> {srtStartTime} = $fromMilli
                if $sub -> {srtStartTime} < $fromMilli;
        }
        if (defined $toMilli) {
            # The subtitle must begin earlier than the end boundary
            next if $sub -> {srtStartTime} > $toMilli;

            # Fix overlapping end timing
            $sub -> {srtEndTime} = $toMilli
                if $sub -> {srtEndTime} > $toMilli;
        }

        # All conditions met, all fixes done
        push @kept, $sub;
    }
    # Replace the contents, not the reference: the caller keeps using it
    @{$subs} = @kept;
    return;
}

# Print brief usage help and end the program
# Accepts an optional error message, e.g. for errors parsing command line
# Exit code is 2 when a message is given and 0 otherwise.
sub usage
{
    my $msg = shift;
    my $exitCode = 0;

    if (defined $msg) {
        $exitCode = 2;
        print STDERR "$msg\n";
    }

    print STDERR <<USAGE;
Usage: $0 [switches]
  -if,--input-format <fmt>  input format; supported: SRT
                            default is SRT
  -of,--output-format <fmt> output format; supported: SRT
                            default is same as input format
  -s,--shift <time>         shift all subtitles by <time>
                            (format: [-]hh:mm:ss,fraction)
  -c,--scale <time>         scale by adding <time> to overall duration
  -f,--split-from <time>    Drop subtitles that end before <time>
  -t,--split-to <time>      Drop subtitles that start after <time>
                            (will truncate timing if it overlaps a boundary)
  -r,--renumber             renumber SRT subtitles in output
                            (always done; accepted for compatibility)
  -d,--debug                enable debug output
  -h,--help                 this help message

All times could be negative. Input/output may also contain negative timings,
which is sometimes useful for intermediate results.
SRT subtitles are always renumbered on output.

EXAMPLES

Split subtitle file into two disks at a boundary of one hour 15 minutes:

  subedit.pl --split-to 1:15:0 < all.srt > p1.srt
  subedit.pl -f 1:15:0 < all.srt | subedit.pl --shift -1:15:0 > p2.srt

Join the previous two disks back into one file:

  subedit.pl -s 1:15:00 < p2.srt | cat p1.srt - | subedit.pl > all.srt

Correct a situation where the first subtitle starts in sync with the video,
but the last one starts 3.5 seconds earlier than the speech in the video,
assuming the first subtitle timing is 00:01:05.030:

  subedit.pl -s -1:5.03 | subedit.pl -c 3.5 | subedit.pl -s 1:5.03
USAGE

    exit $exitCode;
}

1;