#!/usr/bin/perl

# Preamble: pragmas, module imports and I/O layers for the converter script.

use warnings;
# Switch off the whole 'experimental' warnings category where it exists (Perl 5.18+).
no if ($]>=5.018), warnings => 'experimental';
use strict;
no strict 'refs';
use utf8;	# this source file contains Hebrew (UTF-8) literals
use English;
use Encode;
use Getopt::Long;
use IPC::Run 'run';

# The three standard streams carry UTF-8 text.
binmode STDIN, "utf8";
binmode STDOUT, "utf8";
binmode STDERR, "utf8";

# Conversion switches and their defaults.
my $raw = 1;		# run the extra reordering passes guarded by "if ($raw)"
my $brackets = 1;	# wrap recognised references in [[...]]
my $debug = 0;		# dump the intermediate text after each stage
my $verbose = 0;	# report sizes and heuristic counters on STDERR
my $clean = 1;		# pass the input through cleanup() before processing

# Scratch counters shared by the heuristic checks further down.
my ($t1, $t2);

# --raw and --clean default to on, so they are negatable (--no-raw, --no-clean);
# the plain --raw / --clean spellings keep working as before.
# Note that --brackets switches bracket generation OFF.
# --verbose enables the progress reports without the debug dumps.
GetOptions(
	"raw!" => \$raw,
	"clean!" => \$clean,
	"brackets" => sub { $brackets = 0; },
	"debug" => \$debug,
	"verbose" => \$verbose,
) or die("Error in command line arguments\n");

# Slurp the whole input in one read: with the record separator undefined,
# a single <> returns everything.
local $/;
$_ = <>;
$_ = '' unless defined $_;	# empty input: keep the later length()/regex calls warning-free
$_ = cleanup($_) if $clean;

# Debug implies verbose; debug dumps are suppressed for inputs of 2000+ characters.
$verbose ||= $debug;
print STDERR "Input size is " . length($_) . "\n" if $verbose;
$debug &&= (length($_)<2000);

# Regex fragments (kept as strings and interpolated into the patterns below).
# They are single-quoted, so sequences such as \n reach the regex engine verbatim.

# The word "amendment" / "amendments" (תיקון / תיקונים), with an optional yod.
my $fix_sig = 'תי?קו(?:ן|ים)';
# A section number: digits followed by any mix of suffix characters,
# dotted sub-numbers (.1.2) and parenthesised parts.
my $num_sig = '\d+(?:[^ ,.:;"\n\[\]()]+|(?:\.\d+)+|\([^ ,.:;"\n\[\]()]+\))*+';
# A parenthesised sub-section starting with a digit, optionally followed by more (...) parts.
my $sub_sig = '\(\d[^ ,.:;"\n\[\]()]*\)(?:\([^ ,.:;"\n\[\]()]+\))*+';
# my $chp_sig = '\d+(?:[^ ,.:;"\n\[\]()]{0,5}?\d*\.|(?:\.\d+)+)';
# A chapter numeral: digits, up to five suffix characters, optional digits,
# then a dot that is not followed by another digit.
my $chp_sig = '\d+[^ ,.:;"\n\[\]()]{0,5}?\d*\.(?!\d)';
# A Hebrew ordinal (first .. twentieth), a Hebrew-letter numeral, or digits plus a
# Hebrew letter, with an optional leading "ה" and optional trailing digits.
# Fix: the "eighteen" alternative used to start with a stray "[", which turned the
# word into a character class so that "שמונה עשרה" could never match.
my $ext_sig = 'ה?(?:(ראשו[נן]ה?|שניי?ה?|שלישית?|רביעית?|חמישית?|שי?שית?|שביעית?|שמינית?|תשיעית?|עשירית?|אח[דת][ \-]עשרה?|ש[נת]יי?ם[ \-]עשרה?|שלושה?[ \-]עשרה?|ארבע[ \-]עשרה?|חמי?שה?[ \-]עשרה?|שי?שה?[ \-]עשרה?|שבעה?[ \-]עשרה?|שמונה[ \-]עשרה?|תשעה?[ \-]עשרה?|עשרים|[א-יכל][\' ]|[טיכל]"[א-ט]|\d+[א-ת])(\d*))';
# A legislation keyword (law / ordinance / regulations / order) with optional prefixes.
my $law_sig = 'ו?ש?[בהלמ]?(?:חוק|פקוד[הת]|תקנות|צו)\b';

# Check if we've got all parentheses wrong.
# $t1 counts an opening bracket directly before the word "amendment(s)",
# $t2 counts a closing bracket directly before it.  The list assignment counts
# returned captures (two per hit), so only the comparison is meaningful.
$t1 = () = (/([\(\[](תיקון|תיקונים):?)/g);
$t2 = () = (/([\)\]](תיקון|תיקונים):?)/g);
# print STDERR "got $t1 and $t2.\n";
if ($t1 < $t2) {
	# Mirrored brackets are in the majority: swap every bracket with its counterpart.
	tr/([{<>}])/)]}><{[(/;
}

print "################\n$_\n################\n" if ($debug);

# Section elements
# A line consisting of a heading word, a number or ordinal, and an optional
# ":" / "," / "-" tail becomes a wiki heading:
#   חלק (part)                        -> = ... =
#   פרק / תוספת (chapter / schedule)  -> == ... ==
#   סימן (sub-chapter)                -> === ... ===
s/^("?חלק ($num_sig|$ext_sig) *([:,-].*|))$/\n= $1 =\n/gm;
s/^("?(פרק|תוספת) ($num_sig|$ext_sig) *([:,-].*|))$/\n== $1 ==\n/gm;
s/^("?סימן ($num_sig|$ext_sig) *([:,-].*|))$/\n=== $1 ===\n/gm;
# Exactly one blank line before every line that starts with "=".
s/\n+(?=\=)/\n\n/g;
# An amendment note on the line before a heading moves inside the heading,
# just ahead of the closing "=" run ...
s/\n *(\((?:תיקון|תיקונים):? .*?\)) *(\n+=+ .*) (=+)\n/$2 $1 $3\n/g;
# ... and so does an amendment note on the line(s) after the heading.
s/ *(=+)\n+ *(\((תיקון|תיקונים):? .*?\)) *\n/ $2 $1\n/g;

# "(תיקון מס' N) <Hebrew year>-<4 digits>" is reduced to "(תיקון: <Hebrew year>)" on its own line.
s/ *\(תיקון:? מס' \d+\) (תש.?".)-\d\d\d\d/\n(תיקון: $1)/g;
# Merge adjacent "(תיקון: a) (תיקון: b)" notes into "(תיקון: a, b)" until none are left.
while (s/(\(תיקון: [^()\n]+)\)[\n ]\(תיקון: ([^()\n]+\))/$1, $2/g) {};

print "##AA############\n$_\n##AA############\n" if ($debug);

# A text line followed by a "N. (...)" line: put the numbered part first on one line.
s/\n([^\n]+)\n(\d+\S{0,3}\. \([^\n\)]+\)) *\n/\n$2 $1\n/g;

# Join separated lines
# Lines starting with a digit, "=", "@", ":" or "-" get a temporary trailing space,
# so the join below (which needs a letter or comma right before the newline)
# cannot glue the following line onto them; the space is stripped again afterwards.
s/^([\d=\@:\-].*)$/$1 /gm; # Disallow concatenation on certain prefixes
s/([א-ת\,A-Za-z])\n([א-תA-Za-z])/$1 $2/gm;
s/ +$//gm;
# Re-attach a four-digit year that was split from its '"X-' Hebrew-year prefix.
s/("[א-ת]-)\n(\d{4})/$1$2/gm;

# A line starting with a number token whose "digits." continuation sits at the start
# of a later line: append that continuation to the number token, join the text in
# between onto the same line, and start the remainder on a ": " line.
s/(\n\d[\dא-ת]*) (.*)\n([^\.]+)\n(\d*\.) /$1$4 $2 $3\n: /g;
s/(\n\d[\dא-ת]*) (.*)\n(\d*\.) /$1$3 $2\n: /g;

# Keep "(מס' N" and amendment notes on the same line as the surrounding text.
s/\n([\(\[]מס' \d)/ $1/gm;
s/([\(\[](תיקון|תיקונים):?) *\n/$1 /gm;
s/\n(?=[\[\(](תיקון|תיקונים):?\b)/ /gm;

print "##BB############\n$_\n##BB############\n" if ($debug);

# Exactly one space after a chapter numeral at the start of a line.
s/^($chp_sig) */$1 /gm;

# Check if chapter number is misplaced
# $t1: lines ending in . ; : or - that are followed (after one or two newlines) by a numeral;
# $t2: lines NOT ending in such punctuation that are followed by a numeral.
$t1 = () = (/^.*[.;:\-] *\n{1,2}\d+\S{0,3}?\./gm);
$t2 = () = (/^.*[^.;:\-] *\n{1,2}\d+\S{0,3}?\./gm);
print STDERR "Got $t1 vs $t2.\n" if $verbose;
# NOTE(review): "|| $debug" forces this rewrite whenever debugging is on, so a debug
# run can produce different output from a normal run -- confirm this is intended.
if ($t1<$t2 || /^[^.]+ (\((תיקון|תיקונים).*\)|\[(תיקון|תיקונים).*\])\n{1,2}\d+\./m || $debug) {
	# Treat the line before a numeral as its title: emit "@ <numeral> <title>".
	s/^([^=.\n_]+)\n{1,2}($chp_sig)[ \n]+/@ $2 $1\n/gm;
}

# $t1 = () = (/^\d[^\.\n ]*\. .*?[:;.]\n.*?[^.;:\-\n]\n/gm);
# $t2 = () = (/^\d[^\.\n ]*\. .*?[^:;.\n]\n/gm);
# print STDERR "Got $t1 vs $t2.\n" if $verbose;
# 
# if ($t1>$t2) {
# 	s/^(\d[^\.\n ]*\.) (.*)\n/$2\n$1 /gm;
# }

# Should swap chapter title and numeral?
# Two consecutive lines that each start with a short "(x)" marker are joined.
s/^("?\(\S{1,4}?\))\n("?\(\S{1,4}?\))/$1 $2/gm;
# s/^(.+)\n((\d\S*?\. *)?"?\(\S{1,4}?\)) *\n(?!\()/$2 $1\n/gm;
# A text line followed by a line holding only a marker ("(x)", optionally preceded
# by "N."): move the marker in front of the text.
s/^(.+)\n{1,2}((\d\S*?\. *)?"?\(\S{1,4}?\)) *\n/$2 $1\n/gm;
# Same for a line holding only "N.", provided the text line does not end in " . ; or :
s/^(.+[^".;:\n])\n("?\d\S*?\.)\n/$2 $1\n/gm;

if ($raw) {
	# A line holding only digits or "*" is moved in front of the preceding text line.
	s/^(.+)\n(\d+|\*)\n/$2 $1\n/gm;
	# s/^(\d+[,;.]?)\n($law_sig.*)/$2 $1/gm;
	# s/^(\d+[,;.]?.*?)\n(.*?\d{4}( \[.*?\])?)$/$2 $1/gm;
}

print "##CC############\n$_\n##CC############\n" if ($debug);

# s/^(?:\n?@ *|)(\d\S*?\.)(?| (.*)|())$/"@ $1 " . fix_description($2)/gme;
# s/^(?:\n?@ *|)($chp_sig)( +(.*)|())$/@ $1 $2/gm;
# Every line that starts with a chapter numeral followed by text gets the "@ " prefix
# (an existing "@" prefix is normalised).
s/^(?:\n?@ *|)($chp_sig) (.+)$/@ $1 $2/gm;
# A title line (containing none of . ; : = _ |) directly before an "@ <numeral>" line
# is moved behind the numeral.
s/^([^.;:=\n_|]+?) *\n(@ $chp_sig)[ \n]+/\n$2 $1\n/gm;
# Lines starting with a short "(x)" marker become ": (x)" lines.
s/^("?\([^)]{1,4}\))/: $1/gm;
# The line after an "@" line becomes a ": " line unless it already starts with : = or @.
s/^(@.*?)\n(?=[^:=@])/$1\n: /gm;
# Exactly one blank line before every "@".
s/\n++(?=@)/\n\n/gs;
# Normalise amendment notes in section descriptions and in headings.
s/^@ ($chp_sig) (.*)$/"@ $1 " . fix_description($2)/gme;
s/^(=+ .* =+)$/fix_description($1)/gme;

print "##DD############\n$_\n##DD############\n" if ($debug);

if ($raw) {
	# A short Hebrew line followed by a blank line and an "@ N." numeral is moved
	# behind that numeral; repeated until nothing changes.
	while (s/^([א-ת].{5,20}?[^"=.;\n)_ ]) *\n\n(@ \d.*?\.) /\n\n$2 $1 /gm) {}
}

# Heuristic: look at a line, then a text line (not starting with "(" or ":"), then a
# ": (letter) (digit)" line.  $t1 counts cases where the first line ends in . ; : or -,
# $t2 the cases where it does not.
$t1 = () = (/^.*[.;:\-] *\n[^\(:\n].*\n: \([א-ת]\) \(\d\)/gm);
$t2 = () = (/^.*[^.;:\-] *\n[^\(:\n].*\n: \([א-ת]\) \(\d\)/gm);
print STDERR "Got $t1 vs $t2.\n" if $verbose;

if ($t1>$t2) {
	# Move the text line behind the ": (letter)" marker and give "(digit)" its own ": " line.
	s/^([^\(:\n].*)\n(: \([א-ת]\S{0,2}?\)) (\(\d\S{0,2}?\))/$2 $1\n: $3/gm;
	# s/([^\d=\@:\-\n].*?[א-ת\-\,])\n([א-ת])/$1 $2/gm;
	# Same protected line join as in the earlier "join separated lines" stage.
	s/^([\d=\@:\-].*)$/$1 /gm;
	s/([א-ת\,A-Za-z])\n([א-תA-Za-z])/$1 $2/gm;
	s/ +$//gm;
	
	# s/([א-ת\-\,])\n([א-ת])/$1 $2/gm;
	# s/^([א-ת].*?[^.;:\-\n])\n(:( \(\S{1,3}?\))+)\n?/$2 $1\n/gm;
	# s/^([^\(:].*)\n(: \(.{1,3}?\)) (\(.{1,3}?\))/$2 $1\n: $3/gm;
}

# Unprotected join: any letter/comma at end of line followed by a letter on the next line.
s/([א-ת\,A-Za-z])\n([א-תA-Za-z])/$1 $2/gm;

print "##EE############\n$_\n##EE############\n" if ($debug);

# print $_; exit;

# A parenthesised "deleted" / "repealed" / "expired" word at the end of a line, with its
# "." or ";" either inside or outside the parentheses, is rewritten as " (((word)punct))".
s/ \((נמחקה?|בו?טלה?|פקעה?)(?|\)([.;])|([.;])\))\n/ ((($1)$2))\n/gm;
# s/^(:+) *(\([^)\n]*\)[.;])$/$1 (($2))/gm;
# s/^(:+ \(\S+?\)) (\([^)\n]+\)[.;])$/$1 (($2))/gm;
# Collapse runs of spaces.
s/ {2,}/ /g;


if ($brackets) {
	# Wrap "<section-like word> <number>" (section / regulation / item / measure, with
	# optional Hebrew prefixes) in [[...]] unless it is already bracketed.
	s/(?<!\[)(ו?ש?[בהלמ]?(סעיף|סעיפים|תקנה|תקנות|פרט|פרטים|אמו?ת[- ]מידה)\s$num_sig)(?!\])/[[$1]]/g;
	# Walk over the links from the start of the text and try to extend each one over
	# the continuation that follows it.  $repeat means "the link was just extended:
	# re-examine the same link instead of searching for the next one".
	pos = 0;
	my $repeat = 0;
	while ($repeat || m/\[\[(.*?)\]\]/gc) {
		$repeat = 0;
		# Directly followed by another link: nothing to extend here.
		next if /\G[,; ]*\[\[/;
		# $+[1] is the offset just past the link text, i.e. where its closing "]]" starts.
		my $pos = $+[1];
		pos = $pos;
		# m/(.{0,20})\G(.{0,20})/; print STDERR "POS is $pos\t ... $1<-|->$2 ...\n" if ($debug);
		
		# First matching rule wins; with no match, move on to the next link.
		#  1. "]], <number>"                        -> the number becomes a link of its own
		#  2. "]], (<sub-section>)"                 -> pulled inside the link
		#  3. "]]" + "(x)" items (and / or / until) -> pulled inside the link
		#  4. "]] until <number>" or and/or/until "(digit...)" -> pulled inside the link
		#  5. "]] and/or/until <number>"            -> becomes a link of its own
		0	|| s/\G\]\],\s($num_sig)/]], [[$1]]/
			|| s/\G\]\],\s($sub_sig)/, $1]]/
			|| s/\G\]\](,?\s*((ו-|או\s|עד\s|)\([א-ת\d]+\))+)/$1]]/
			|| s/\G\]\](\sעד\s$num_sig|\s(?:ו-|או\s|עד\s)\(\d\S*?\))/$1]]/
			|| s/\G\]\]\s((?:ו-|או\s|עד\s)$num_sig)(?!\])/]] [[$1]]/
			|| next;
		
		# Go back to the saved offset, as it was before the substitution.
		pos = $pos;
		# m/(.{0,20})\G(.{0,20})/; print STDERR "\t\t ... $1<-|->$2 ...\n" if ($debug);
		
		$repeat = 1;
		# Advance past the (possibly moved) closing "]]" so $+[1] points at it again.
		m/(.*?)\]\]/gc;
	}
	
	# s/(פסק([הא]|אות) ((ו-)?\(..?\),?)+) \[\[/[[$1/g;
	
	# Chapter / sub-chapter / schedule references: "<word> <this | number | ordinal>".
	s/(?<!\[)(ו?ש?[בהלמ]?(פרק|פרקים|סימן|סימנים|תוספת) ה?(ז[הו]|$num_sig|$ext_sig)[^ ,.:;\n\[\]]{0,8}+)(?![\]:])/[[$1]]/g;
	# "that same <section|chapter|...>" references (but not "sub-section").
	s/(?<!\[)(ו?ש?[בהלמ]?אות[והם] (סעיף(?! קטן)|סעיפים(?! קטנים)|פרק|פרקים|סימן|סימנים|תוספת))(?![\]:])/[[$1]]/g;
	# "the said <section|chapter|...>" references.
	s/(?<!\[)(ו?ש?[בהלמ]?(סעיף|פרק|פרקים|סימן|סימנים|תוספת) האמור[א-ת]*)(?![\]:])/[[$1]]/g;
	# s/(?<!\[)(ו?ש?[בהלמ]?(תוספת))\b(?!\])/[[$1]]/g;
	# Legislation names: keyword, up to 100 characters, then ", " or "-" and a 4-digit year.
	s/(?<!\[)($law_sig [^;.\n]{1,100}?(, |-)\d{4})(?!\])/[[$1]]/g;
	# A trailing "[new version]" / "[consolidated version]" tag belongs inside the link.
	s/\]\]( \[(נוסח\sחדש|נוסח\sמשולב)\])/$1]]/g;
	# A link that swallowed two legislation keywords keeps only the second one linked.
	s/\[\[($law_sig [^\[\]].*?) ($law_sig[^\[\]].*)\]\]/$1 [[$2]]/g;
	# Two links separated by a single space are merged when the second starts with a legislation keyword.
	s/\]\] \[\[(?=$law_sig)/ /g;
	
	# Flatten a link nested inside another link, and join links broken across a newline.
	s/\[\[([^\[\]]*+)\[\[(.*?)\]\](.*?)\]\]/[[$1$2$3]]/g;
	s/(\[\[[^\[\]\n]*+)\n([^\[\]\n]*+\]\])/$1 $2/g;
	# Every line starting with "=" is passed through remove_brakets().
	s/^(=.*)$/remove_brakets($1)/gme;
}

# When the text opens with a legislation keyword, treat the first line as the title.
if (/^\[*(חוק|פקודת|תקנות)\b/s) {
	# Join the line that ends in the four-digit year (plus an optional "*" or digit marker)
	# onto the line before it.
	s/^(.*)\n(.*\d{4})( *\*+| \d|)\n/$1 $2$3\n/s;
	# Prefix the first line with the "<שם>" (name) tag and run it through remove_brakets().
	s/^(?:\<שם\>|) *(.*)\n/"<שם> ". remove_brakets($1) . "\n"/se;
	# Add a placeholder "<מקור>" (source) line after the first line if the text has none.
	s/^(.*?\n)\n*/$1\n<מקור> ...\n\n/s if (!/<מקור>/);
}

# Final tidy-up: trim leading newlines, end with a single newline, allow at most one
# blank line in a row, and drop trailing spaces.
s/\n*(.*?)\n*$/$1\n/s;
s/\n{3,}/\n\n/g;
s/ +$//mg;

print STDERR "Output size is " . length($_) . "\n" if $verbose;

print $_;

exit;
1;


# fix_description($text)
#
# Normalises the first amendment note found in $text and returns the result.
# The note may be "[תיקון ...]", "(תיקון ...)" or an unterminated "[תיקון ..." running
# to the end of the string.  It is rewritten as "(תיקון: <details>)", where <details>
# has the wrapper removed, a leading "ה" dropped from Hebrew years, and
# '<year> (מס' N)' turned into '<year>-N'.  Leading and trailing spaces are trimmed.
# A string without an amendment note is returned trimmed but otherwise unchanged.
#
# Uses a named lexical rather than "my $_", which is a compile-time error
# from Perl 5.24 onwards.
sub fix_description {
	my $text = shift;
	my $fix = '';
	
	# Park the note behind a placeholder while its contents are cleaned up.
	# $1 is only read after a successful match, so it can never be a stale capture.
	if ($text =~ s/(?|(\[(תי?קון|תיקונים)\b([^\[\]]+|\[.*?\])+\])|(\((תי?קון|תיקונים)\b([^\(\)]+|\(.*?\))+\))|(\[(תי?קון|תיקונים)\b.*)$)/(FIXSTR)/) {
		$fix = $1;
	}
#	s/(\[(?:תי?קון|תיקונים)\b:? *(?:[^\[\]]+|\[.*?\])+\])/(FIXSTR)/ ||
#		s/(\((?:תי?קון|תיקונים)\b:? *(?:[^\(\)]+|\(.*?\))+\))/(FIXSTR)/ ||
#		s/([\[\(](?:תי?קון|תיקונים)\b.*)$/(FIXSTR)/;
	
	# Strip the brackets and the leading "תיקון[:]" word.
	$fix =~ s/^[\(\[](?:תי?קון|תיקונים)\b:? *(.*?)[\)\]]$/$1/;
	# Drop the "ה" prefix in front of a Hebrew year.
	$fix =~ s/ה(תש.?".?)/$1/g;
	# '<year> (מס' N)' -> '<year>-N'.
	$fix =~ s/(תש.?".) \(מס' (\d.*?)\)/$1-$2/g;
	# '<year>-N, (מס' M)' -> '<year>-N, <year>-M', repeated for longer lists.
	while ($fix =~ s/(תש.?".)-(\d[^\,]*|),\s*\(מס' (\d.*?)\)/$1-$2, $1-$3/g) {};
	$text =~ s/\(FIXSTR\)/(תיקון: $fix)/;
	$text =~ s/^ *(.*?) *$/$1/;
	return $text;
}

# remove_brakets($text)
#
# Returns $text with its first "[[" and its first "]]" removed; later pairs are kept.
# NOTE(review): only the first pair is stripped -- confirm that this is the intent
# for headings that contain more than one link.
#
# Uses a named lexical rather than "my $_", which is a compile-time error
# from Perl 5.24 onwards.
sub remove_brakets {
	my $text = shift;
	$text =~ s/\[\[//;
	$text =~ s/\]\]//;
	return $text;
}


# cleanup($text)
#
# Pipes $text through the "clear.pl" script that sits next to this script and returns
# its standard output decoded from UTF-8.  The child's STDERR is passed through.
#
# The helper's location is derived with dirname(), so running the script without a
# directory part (e.g. "perl basic-syntax.pl") resolves to "./clear.pl"; the previous
# string surgery produced "/clear.pl" in that case.
# NOTE(review): $text is handed to the child as-is (no explicit encoding step) and the
# result of run() is not checked -- confirm both are acceptable.
sub cleanup {
	require File::Basename;
	require File::Spec;
	
	my $in = shift;
	my $dir = File::Basename::dirname($0);
	my @cmd = (File::Spec->catfile($dir, 'clear.pl'));
	my $out;
	run \@cmd, \$in, \$out, *STDERR;
	return decode_utf8($out);
}