File size: 2,002 Bytes
f9d7028
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/usr/bin/env perl
#
# This file is part of moses.  Its use is licensed under the GNU Lesser General
# Public License version 2.1 or, at your option, any later version.

use warnings;
use strict;

# Settings that the command line may override.
my $language = "en";    # language code; "en" unless the caller names another
my $PENN     = 0;       # becomes 1 when -penn is given

# Walk @ARGV one token at a time.  Recognised tokens:
#   -b        turn off output buffering (flush after every line)
#   -l LANG   take the next token as the language code
#   LANG      any token that does not start with '-' is a language code too
#   -penn     set $PENN to 1
# Tokens matching none of the above are skipped without complaint.
while (@ARGV) {
    my $arg = shift @ARGV;

    if ($arg =~ /^-b$/) {
        $| = 1;
    }
    elsif ($arg =~ /^-l$/) {
        $language = shift @ARGV;
    }
    elsif ($arg =~ /^[^\-]/) {
        $language = $arg;
    }
    elsif ($arg =~ /^-penn$/) {
        $PENN = 1;
    }
}

# Normalize the punctuation of a single line of text and return the result.
#
#   $line     - text to normalize; the caller's variable is not modified
#   $language - language code (defaults to "en" when undefined):
#                 "en"        moves commas/periods inside a closing quote
#                 "cs", "cz"  leave the quote/punctuation order alone
#                 otherwise   moves commas/periods outside the closing quote
#               Digits separated by a non-breaking space are joined with ","
#               for "de", "es", "cs", "cz" and "fr", and with "." otherwise.
#   $penn     - when true, skip the initial backtick -> ' and '' -> " rewrites
#
# The substitutions are order-dependent; keep their sequence intact.
#
# NOTE(review): no `use utf8` or input decoding layer is visible in this
# script, so the patterns are matched against raw UTF-8 bytes.  The
# non-breaking space is therefore spelled as the byte pair \xC2\xA0 rather
# than as an invisible literal, which was indistinguishable from a plain
# space and easy to lose when the file is copied or re-saved.  NBSP is
# assumed to be the intended "pseudo-space" -- confirm against upstream.
sub normalize_line {
    my ($line, $language, $penn) = @_;
    $language = "en" unless defined $language;

    # Alias $_ to our private copy so the bare s/// operators act on it.
    for ($line) {
        s/\r//g;

        # remove extra spaces
        s/\(/ \(/g;
        s/\)/\) /g;
        s/ +/ /g;
        s/\) ([\.\!\:\?\;\,])/\)$1/g;
        s/\( /\(/g;
        s/ \)/\)/g;
        s/(\d) \%/$1\%/g;
        s/ :/:/g;
        s/ ;/;/g;

        # normalize unicode punctuation
        unless ($penn) {
            s/\`/\'/g;
            s/\'\'/ \" /g;
        }
        s/„/\"/g;
        s/“/\"/g;
        s/”/\"/g;
        s/–/-/g;
        s/—/ - /g;
        s/ +/ /g;
        s/´/\'/g;
        # a curly quote between two letters is an apostrophe
        s/([a-z])‘([a-z])/$1\'$2/gi;
        s/([a-z])’([a-z])/$1\'$2/gi;
        s/‘/\'/g;
        s/‚/\'/g;
        s/’/\"/g;
        s/''/\"/g;
        s/´´/\"/g;
        s/…/.../g;

        # French quotes
        s/ « / \"/g;
        s/« /\"/g;
        s/«/\"/g;
        s/ » /\" /g;
        s/ »/\"/g;
        s/»/\"/g;

        # handle pseudo-spaces (non-breaking space, UTF-8 bytes C2 A0)
        s/\xC2\xA0\%/\%/g;
        s/nº\xC2\xA0/nº /g;
        s/\xC2\xA0:/:/g;
        s/\xC2\xA0ºC/ ºC/g;
        s/\xC2\xA0cm/ cm/g;
        s/\xC2\xA0\?/\?/g;
        s/\xC2\xA0\!/\!/g;
        s/\xC2\xA0;/;/g;
        s/,\xC2\xA0/, /g;
        s/ +/ /g;

        # English "quotation," followed by comma, style
        if ($language eq "en") {
            s/\"([,\.]+)/$1\"/g;
        }
        # Czech is confused
        elsif ($language eq "cs" || $language eq "cz") {
        }
        # German/Spanish/French "quotation", followed by comma, style
        else {
            s/,\"/\",/g;
            s/(\.+)\"(\s*[^<])/\"$1$2/g; # don't fix period at end of sentence
        }

        # Digits split by a non-breaking space: pick the separator by language.
        # A plain ASCII space between digits is deliberately left alone.
        if ($language eq "de" || $language eq "es" || $language eq "cz" || $language eq "cs" || $language eq "fr") {
            s/(\d)\xC2\xA0(\d)/$1,$2/g;
        }
        else {
            s/(\d)\xC2\xA0(\d)/$1.$2/g;
        }
    }

    return $line;
}

# Filter STDIN to STDOUT, one normalized line at a time.
while (<STDIN>) {
    print normalize_line($_, $language, $PENN);
}