1#!/usr/bin/env python3
2#
3# Takes a list of files on the command line and checks for valid
4# UTF-8 data. Used for checking .po files.
5#
6# Copyright © 2016 Dr. Tobias Quathamer <toddy@debian.org>
7#
8# This program is free software; you can redistribute it and/or
9# modify it under the terms of the GNU Lesser General Public
10# License as published by the Free Software Foundation; either
11# version 2.1 of the License, or (at your option) any later version.
12#
13# This program is distributed in the hope that it will be useful,
14# but WITHOUT ANY WARRANTY; without even the implied warranty of
15# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16# Lesser General Public License for more details.
17#
18# You should have received a copy of the GNU Lesser General Public
19# License along with this program; if not, write to the Free Software
20# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301  USA
21
22import re
23import sys
24
# The header every checked .po file must declare somewhere in its content
CONTENT_TYPE_HEADER = "Content-Type: text/plain; charset=UTF-8"


def check_file(filename):
    """Check a single file for valid UTF-8 data and the UTF-8 header.

    The file is read line by line in binary mode. Every line must decode
    as strict UTF-8, and at least one line must contain the text
    'Content-Type: text/plain; charset=UTF-8'.

    A diagnostic is printed to stdout for the first problem found.

    Args:
        filename: Path of the file to check.

    Returns:
        True if the file is valid, False otherwise (unreadable file,
        invalid UTF-8 data, or missing Content-Type header).
    """
    # The "Content-Type" header has not been seen yet
    charset_utf8_seen = False
    try:
        # Open the file for reading in binary mode, so that decoding
        # errors can be detected and reported per line
        with open(filename, "rb") as pofile:
            for line in pofile:
                # Try to decode binary data to UTF-8
                try:
                    utf8 = line.decode(encoding="utf-8", errors="strict")
                except UnicodeDecodeError as error:
                    print("UTF-8 encoding error in file %s: %s (position %d)"
                          % (filename, error.reason, error.start))
                    # %r yields the same b'...' text as %s does for bytes
                    print("Binary data: %r" % line)
                    # The encoding error is the real problem; do not
                    # additionally complain about a missing header.
                    return False
                # Keep reading after the header has been found, the
                # remaining lines still need to be valid UTF-8.
                if CONTENT_TYPE_HEADER in utf8:
                    charset_utf8_seen = True
    except OSError as error:
        # Missing file, directory, permission problem, read error, ...
        print("Error in file %s: could not read file (%s)"
              % (filename, error.strerror or error))
        return False

    # The whole file has been read, the content type should have
    # been detected now. Otherwise, it's an error.
    if not charset_utf8_seen:
        print("Error in file %s: could not detect Content-Type header" % filename)
        return False
    return True


def main(argv):
    """Check all given files and return the exit status for the process.

    Every file is checked, even if an earlier one failed, so that all
    problems are reported in a single run.

    Args:
        argv: The file names to check (without the script name).

    Returns:
        0 if every file is valid (or no file was given), 1 otherwise.
    """
    # Assume that every file is valid
    exit_status = 0
    # Cycle through all files and check for valid UTF-8 encoding
    for filename in argv:
        if not check_file(filename):
            exit_status = 1
    return exit_status


if __name__ == "__main__":
    # Skip the script name without modifying sys.argv itself
    sys.exit(main(sys.argv[1:]))
57