1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package net.sf.okapi.lib.segmentation;
20
21 import java.io.File;
22 import java.io.IOException;
23 import java.io.InputStream;
24 import java.io.StringReader;
25 import java.io.StringWriter;
26 import java.util.ArrayList;
27 import java.util.LinkedHashMap;
28 import java.util.regex.Pattern;
29
30 import javax.xml.parsers.DocumentBuilder;
31 import javax.xml.parsers.DocumentBuilderFactory;
32 import javax.xml.parsers.ParserConfigurationException;
33 import javax.xml.xpath.XPath;
34 import javax.xml.xpath.XPathConstants;
35 import javax.xml.xpath.XPathExpression;
36 import javax.xml.xpath.XPathExpressionException;
37 import javax.xml.xpath.XPathFactory;
38
39 import org.slf4j.Logger;
40 import org.slf4j.LoggerFactory;
41 import org.w3c.dom.Document;
42 import org.w3c.dom.Element;
43 import org.w3c.dom.Node;
44 import org.w3c.dom.NodeList;
45 import org.xml.sax.InputSource;
46 import org.xml.sax.SAXException;
47
48 import net.sf.okapi.common.DefaultEntityResolver;
49 import net.sf.okapi.common.ISegmenter;
50 import net.sf.okapi.common.LocaleId;
51 import net.sf.okapi.common.NSContextManager;
52 import net.sf.okapi.common.Util;
53 import net.sf.okapi.common.XMLWriter;
54 import net.sf.okapi.common.exceptions.OkapiException;
55 import net.sf.okapi.common.exceptions.OkapiIOException;
56 import net.sf.okapi.common.resource.TextFragment;
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76 public class SRXDocument {
// Logger bound to the runtime class of this instance.
private final Logger LOGGER = LoggerFactory.getLogger(getClass());

// Namespace URIs looked up when reading an SRX document (2.0 first, then 1.0).
private static final String NSURI_SRX20 = "http://www.lisa.org/srx20";
private static final String NSURI_SRX10 = "http://www.lisa.org/srx10";
// Namespace URI and prefix used for the Okapi-specific extension elements/attributes.
private static final String NSURI_OKPSRX = "http://okapi.sf.net/srx-extensions";
private static final String NSPREFIX_OKPSRX = "okpsrx";
// Name of the classpath resource loaded when DEFAULT_SRX_RULES is requested.
private static final String DEFAULT_SRX_FILE = "defaultSegmentation.srx";

/**
 * Special value accepted by {@link #loadRules(String)}: instead of a path or URL,
 * it selects the default rules bundled as a classpath resource.
 */
public static final String DEFAULT_SRX_RULES = "DEFAULT_SRX_RULES";

/**
 * Regular-expression group matching one of the three {@link TextFragment} marker
 * characters (opening, closing, isolated) followed by any single character.
 */
public static final String INLINECODE_PATTERN = String.format("([\\u%X\\u%X\\u%X].)",
TextFragment.MARKER_OPENING,
TextFragment.MARKER_CLOSING, TextFragment.MARKER_ISOLATED);

/**
 * Token that rule patterns may contain; it is replaced by {@link #INLINECODE_PATTERN}
 * when the rules are compiled.
 */
public static final String ANYCODE = "\\Y";

/**
 * Suffix that, when it ends a rule's before-break pattern, prevents the automatic
 * in-line code pattern from being appended to that pattern.
 */
public static final String NOAUTO = "[noauto]";

// Group matching zero or more in-line codes; appended to before-break patterns
// that do not end with NOAUTO.
private static final String AUTO_INLINECODES = "(" + INLINECODE_PATTERN + "*)";

// --- Options stored in the SRX header (standard attributes and Okapi extensions) ---
private boolean cascade;
private boolean segmentSubFlows;
private boolean includeStartCodes;
private boolean includeEndCodes;
private boolean includeIsolatedCodes;
private boolean oneSegmentIncludesAll;
private boolean trimLeadingWS;
private boolean trimTrailingWS;
// NOTE(review): the loader only ever sets this to true and the writer always emits "yes".
private boolean useJavaRegex = true;
private boolean useIcu4JBreakRules = false;
private boolean treatIsolatedCodesAsWhitespace;
// Version of the document: "1.0" or "2.0" after a load, always "2.0" after a save.
private String version = "2.0";
// Warning text set when a version 1.0 document is loaded; null otherwise.
private String warning;
// Sample text/language and test flag stored in the okpsrx:sample extension element.
private String sampleText;
private String sampleLanguage;
// True when a setting or rule was changed since the last load, save or reset.
private boolean modified;
private boolean testOnSelectedGroup;
// Language maps (language pattern -> rule name), in document order.
private ArrayList<LanguageMap> langMaps;
// Rule sets keyed by language rule name, in insertion order.
private LinkedHashMap<String, ArrayList<Rule>> langRules;
// Pattern stored in the okpsrx:rangeRule extension element; may be null.
private String maskRule;
// Comment found before the <srx> element, and before the <header> element.
private String docComment;
private String headerComment;
132
133
134
135
/**
 * Creates an empty SRX document whose options are initialized by {@link #resetAll()}.
 */
public SRXDocument() {
    this.resetAll();
}
139
140
141
142
143
144
145 public String getVersion() {
146 return version;
147 }
148
149
150
151
152
153
154 public boolean hasWarning() {
155 return ((warning != null) && (warning.length() > 0));
156 }
157
158
159
160
161
162
163 public String getWarning() {
164 if (warning == null)
165 return "";
166 else
167 return warning;
168 }
169
170
171
172
173
174
175
176 public String getHeaderComments() {
177 return headerComment;
178 }
179
180
181
182
183
184
185
186
187 public void setHeaderComments(String text) {
188 headerComment = text;
189 if ((headerComment != null) && (headerComment.length() == 0)) {
190 headerComment = null;
191 }
192 }
193
194
195
196
197
198
199 public String getComments() {
200 return docComment;
201 }
202
203
204
205
206
207
208
209
210 public void setComments(String text) {
211 docComment = text;
212 if ((docComment != null) && (docComment.length() == 0)) {
213 docComment = null;
214 }
215 }
216
217
218
219
220 public void resetAll() {
221 langMaps = new ArrayList<>();
222 langRules = new LinkedHashMap<>();
223 maskRule = null;
224 modified = false;
225
226 segmentSubFlows = true;
227 cascade = false;
228 includeStartCodes = false;
229 includeEndCodes = true;
230 includeIsolatedCodes = false;
231
232 oneSegmentIncludesAll = false;
233 trimLeadingWS = false;
234 trimTrailingWS = false;
235 useJavaRegex = true;
236 useIcu4JBreakRules = false;
237 treatIsolatedCodesAsWhitespace = false;
238
239 sampleText = "Mr. Holmes is from the U.K. not the U.S. <B>Is Dr. Watson from there too?</B> Yes: both are.<BR/>";
240 sampleLanguage = "en";
241 headerComment = null;
242 docComment = null;
243 }
244
245
246
247
248
249
250 public LinkedHashMap<String, ArrayList<Rule>> getAllLanguageRules() {
251 return langRules;
252 }
253
254
255
256
257
258
259
260
261 public ArrayList<Rule> getLanguageRules(String ruleName) {
262 return langRules.get(ruleName);
263 }
264
265
266
267
268
269
270 public ArrayList<LanguageMap> getAllLanguagesMaps() {
271 return langMaps;
272 }
273
274
275
276
277
278
279 public boolean segmentSubFlows() {
280 return segmentSubFlows;
281 }
282
283
284
285
286
287
288
289 public void setSegmentSubFlows(boolean value) {
290 segmentSubFlows = value;
291 }
292
293
294
295
296
297
298
299 public boolean cascade() {
300 return cascade;
301 }
302
303
304
305
306
307
308
309
310 public void setCascade(boolean value) {
311 if (value != cascade) {
312 cascade = value;
313 modified = true;
314 }
315 }
316
317
318
319
320
321
322
323
324 public boolean oneSegmentIncludesAll() {
325 return oneSegmentIncludesAll;
326 }
327
328
329
330
331
332
333
334
335
336
337 public void setOneSegmentIncludesAll(boolean value) {
338 if (value != oneSegmentIncludesAll) {
339 oneSegmentIncludesAll = value;
340 modified = true;
341 }
342 }
343
344
345
346
347
348
349 public boolean useIcu4JBreakRules() {
350 return useIcu4JBreakRules;
351 }
352
353
354
355
356
357
358
359
360
361
362 public void setUseICU4JBreakRules(boolean value) {
363 if (useIcu4JBreakRules != value) {
364 useIcu4JBreakRules = value;
365 modified = true;
366 }
367 }
368
369
370
371
372
373
374
375 public boolean treatIsolatedCodesAsWhitespace() {
376 return treatIsolatedCodesAsWhitespace;
377 }
378
379
380
381
382
383
384
385
386 public void setTreatIsolatedCodesAsWhitespace(boolean value) {
387 if (value != treatIsolatedCodesAsWhitespace) {
388 treatIsolatedCodesAsWhitespace = value;
389 modified = true;
390 }
391 }
392
393
394
395
396
397
398 public boolean trimLeadingWhitespaces() {
399 return trimLeadingWS;
400 }
401
402
403
404
405
406
407
408
409 public void setTrimLeadingWhitespaces(boolean value) {
410 if (value != trimLeadingWS) {
411 trimLeadingWS = value;
412 modified = true;
413 }
414 }
415
416
417
418
419
420
421 public boolean trimTrailingWhitespaces() {
422 return trimTrailingWS;
423 }
424
425
426
427
428
429
430
431
432 public void setTrimTrailingWhitespaces(boolean value) {
433 if (value != trimTrailingWS) {
434 trimTrailingWS = value;
435 modified = true;
436 }
437 }
438
439
440
441
442
443
444
445 public boolean includeStartCodes() {
446 return includeStartCodes;
447 }
448
449
450
451
452
453
454
455
456 public void setIncludeStartCodes(boolean value) {
457 if (value != includeStartCodes) {
458 includeStartCodes = value;
459 modified = true;
460 }
461 }
462
463
464
465
466
467
468 public boolean includeEndCodes() {
469 return includeEndCodes;
470 }
471
472
473
474
475
476
477
478
479 public void setIncludeEndCodes(boolean value) {
480 if (value != includeEndCodes) {
481 includeEndCodes = value;
482 modified = true;
483 }
484 }
485
486
487
488
489
490
491
492 public boolean includeIsolatedCodes() {
493 return includeIsolatedCodes;
494 }
495
496
497
498
499
500
501
502
503 public void setIncludeIsolatedCodes(boolean value) {
504 if (value != includeIsolatedCodes) {
505 includeIsolatedCodes = value;
506 modified = true;
507 }
508 }
509
510
511
512
513
514
515 public String getMaskRule() {
516 return maskRule;
517 }
518
519
520
521
522
523
524
525 public void setMaskRule(String pattern) {
526 if (pattern != null) {
527 if (!pattern.equals(maskRule)) {
528 modified = true;
529 }
530 } else if (maskRule != null) {
531 modified = true;
532 }
533 maskRule = pattern;
534 }
535
536
537
538
539
540
541
542
543 public String getSampleText() {
544 if (sampleText == null)
545 return "";
546 else
547 return sampleText;
548 }
549
550
551
552
553
554
555
556 public void setSampleText(String value) {
557 if (value != null) {
558 if (!value.equals(sampleText)) {
559 modified = true;
560 }
561 } else if (sampleText != null) {
562 modified = true;
563 }
564 sampleText = value;
565 }
566
567
568
569
570
571
572 public String getSampleLanguage() {
573 return sampleLanguage;
574 }
575
576
577
578
579
580
581
582
583 public void setSampleLanguage(String value) {
584 if ((value == null) || (value.length() == 0)) {
585 sampleLanguage = "en";
586 modified = true;
587 } else {
588 if (!value.equals(sampleLanguage)) {
589 sampleLanguage = value;
590 modified = true;
591 }
592 }
593 }
594
595
596
597
598
599
600
601
602 public boolean testOnSelectedGroup() {
603 return testOnSelectedGroup;
604 }
605
606
607
608
609
610
611
612
613 public void setTestOnSelectedGroup(boolean value) {
614 if (value != testOnSelectedGroup) {
615 testOnSelectedGroup = value;
616 modified = true;
617 }
618 }
619
620
621
622
623
624
625 public boolean isModified() {
626 return modified;
627 }
628
629
630
631
632
633
634
635
636
637 public void setModified(boolean value) {
638 modified = value;
639 }
640
641
642
643
644
645
646
647
648
649
650
651 public void addLanguageRule(String name, ArrayList<Rule> langRule) {
652 langRules.put(name, langRule);
653 modified = true;
654 }
655
656
657
658
659
660
661
662
663 public void addLanguageMap(LanguageMap langMap) {
664 langMaps.add(langMap);
665 modified = true;
666 }
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684 public ISegmenter compileLanguageRules(LocaleId languageCode, ISegmenter existingSegmenter) {
685 SRXSegmenter segmenter = null;
686 if ((existingSegmenter != null) && (existingSegmenter instanceof SRXSegmenter)) {
687 segmenter = (SRXSegmenter) existingSegmenter;
688 }
689
690 if (segmenter != null) {
691
692 if (languageCode != null) {
693 if (languageCode.equals(segmenter.getLanguage()) && (cascade == segmenter.cascade()))
694 return segmenter;
695 }
696 segmenter.reset();
697 } else {
698 segmenter = new SRXSegmenter();
699 }
700
701 segmenter.setCascade(cascade);
702 segmenter.setOptions(segmentSubFlows, includeStartCodes, includeEndCodes, includeIsolatedCodes,
703 oneSegmentIncludesAll, trimLeadingWS, trimTrailingWS, useJavaRegex, useIcu4JBreakRules,
704 treatIsolatedCodesAsWhitespace);
705
706 for (LanguageMap langMap : langMaps) {
707 if (Pattern.matches(langMap.pattern, languageCode.toString())) {
708 compileRules(segmenter, langMap.ruleName);
709 if (!segmenter.cascade())
710 break;
711 }
712 }
713
714 segmenter.setLanguage(languageCode);
715 return segmenter;
716 }
717
718
719
720
721
722
723
724
725
726
727
728 public ISegmenter compileSingleLanguageRule(String ruleName, ISegmenter existingSegmenter) {
729 SRXSegmenter segmenter = null;
730 if ((existingSegmenter != null) && (existingSegmenter instanceof SRXSegmenter)) {
731 segmenter = (SRXSegmenter) existingSegmenter;
732 }
733
734 if (segmenter != null) {
735
736 if (ruleName != null) {
737 if (segmenter.getLanguage().equals(LocaleId.EMPTY))
738 return segmenter;
739 }
740 segmenter.reset();
741 } else {
742 segmenter = new SRXSegmenter();
743 }
744
745 segmenter.setOptions(segmentSubFlows, includeStartCodes, includeEndCodes, includeIsolatedCodes,
746 oneSegmentIncludesAll, trimLeadingWS, trimTrailingWS, useJavaRegex, useIcu4JBreakRules,
747 treatIsolatedCodesAsWhitespace);
748 compileRules(segmenter, ruleName);
749 segmenter.setLanguage(LocaleId.EMPTY);
750 return segmenter;
751 }
752
753
754
755
756
757
758
759 private void compileRules(SRXSegmenter segmenter, String ruleName) {
760 if (!langRules.containsKey(ruleName) && !useIcu4JBreakRules) {
761 throw new SegmentationRuleException("language rule '" + ruleName + "' not found.");
762 }
763 ArrayList<Rule> langRule = langRules.get(ruleName);
764 String pattern = null;
765 for (Rule rule : langRule) {
766 if (rule.isActive) {
767
768 pattern = generateRuleRegex(rule);
769 pattern = pattern.replace(ANYCODE, INLINECODE_PATTERN);
770
771
772 segmenter.addRule(new CompiledRule(pattern, rule.isBreak));
773 }
774 }
775
776
777 segmenter.setMaskRule(maskRule);
778 }
779
780 public String generateRuleRegex(Rule rule) {
781 String pattern = "";
782 String beforePattern = "";
783 String afterPattern = "";
784
785 afterPattern = "(" + rule.after + ")";
786 if (rule.before.endsWith(NOAUTO)) {
787
788
789 beforePattern = "(" + rule.before.substring(0, rule.before.length() - NOAUTO.length()) + ")";
790 } else {
791
792
793
794
795 beforePattern = "(" + rule.before + AUTO_INLINECODES + ")";
796 }
797
798 if (rule.before.isEmpty()) {
799
800 pattern = "(.|\n)" + afterPattern;
801 } else if (rule.after.isEmpty()) {
802
803 pattern = beforePattern + "()";
804 } else {
805 pattern = beforePattern + afterPattern;
806 }
807
808 return pattern;
809
810 }
811
812
813
814
815
816
817
818
819
820
821 public void loadRules(CharSequence data) {
822 loadRules(data, 1);
823 modified = true;
824 }
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841 public void loadRules(String pathOrURL) {
842 if (DEFAULT_SRX_RULES.equals(pathOrURL)) {
843 loadRules(getClass().getResourceAsStream(DEFAULT_SRX_FILE));
844 } else {
845 loadRules(pathOrURL, 0);
846 }
847 }
848
849
850
851
852
853
854
855
856
857
858 public void loadRules(InputStream inputStream) {
859 loadRules(inputStream, 2);
860 }
861
862 private void loadRules(Object input, int inputType) {
863 try {
864 DocumentBuilderFactory Fact = DocumentBuilderFactory.newInstance();
865 Fact.setValidating(false);
866 Fact.setNamespaceAware(true);
867
868
869 try {
870
871 Fact.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false);
872
873
874 Fact.setFeature("http://xml.org/sax/features/external-general-entities", false);
875 Fact.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
876 Fact.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
877
878
879 Fact.setXIncludeAware(false);
880 Fact.setExpandEntityReferences(false);
881
882 } catch (ParserConfigurationException e) {
883 LOGGER.warn("Unsupported XML feature on this platform, falling back safely.", e);
884 }
885
886 DocumentBuilder docBuilder = Fact.newDocumentBuilder();
887 docBuilder.setEntityResolver(new DefaultEntityResolver());
888
889 Document doc;
890
891 if (inputType == 0) {
892
893 String pathOrURL = (String) input;
894 File srxFile = new File(Util.toURI(pathOrURL));
895 if (!srxFile.exists()) {
896 throw new OkapiException("SRX file not found");
897 }
898
899
900 try (InputStream in = new java.io.FileInputStream(srxFile)) {
901 net.sf.okapi.common.BOMNewlineEncodingDetector detector = new net.sf.okapi.common.BOMNewlineEncodingDetector(
902 in, java.nio.charset.StandardCharsets.UTF_8);
903 detector.detectAndRemoveBom();
904 doc = docBuilder.parse(detector.getInputStream());
905 }
906
907 } else if (inputType == 1) {
908
909 CharSequence data = (CharSequence) input;
910 doc = docBuilder.parse(new InputSource(new StringReader(data.toString())));
911
912 } else {
913
914 InputStream inputStream = (InputStream) input;
915 net.sf.okapi.common.BOMNewlineEncodingDetector detector = new net.sf.okapi.common.BOMNewlineEncodingDetector(
916 inputStream, java.nio.charset.StandardCharsets.UTF_8);
917 detector.detectAndRemoveBom();
918 doc = docBuilder.parse(detector.getInputStream());
919 }
920
921 resetAll();
922
923 XPathFactory xpathFac = Util.createXPathFactory();
924 XPath xpath = xpathFac.newXPath();
925 NSContextManager nsContext = new NSContextManager();
926 nsContext.add("srx", NSURI_SRX20);
927 nsContext.add(NSPREFIX_OKPSRX, NSURI_OKPSRX);
928 nsContext.add("srx1", NSURI_SRX10);
929 xpath.setNamespaceContext(nsContext);
930
931
932 String ns = NSURI_SRX20;
933 XPathExpression xpe = xpath.compile("//srx:srx");
934 NodeList srxList = (NodeList) xpe.evaluate(doc, XPathConstants.NODESET);
935 if (srxList.getLength() < 1) {
936 xpe = xpath.compile("//srx1:srx");
937 srxList = (NodeList) xpe.evaluate(doc, XPathConstants.NODESET);
938 if (srxList.getLength() < 1) {
939 xpe = xpath.compile("//srx");
940 srxList = (NodeList) xpe.evaluate(doc, XPathConstants.NODESET);
941 if (srxList.getLength() < 1) {
942 return;
943 }
944 ns = "";
945 } else
946 ns = NSURI_SRX10;
947 }
948
949
950 Element srxElem = (Element) srxList.item(0);
951 docComment = getPreviousComments(srxElem, null);
952 String tmp = srxElem.getAttribute("version");
953 if (tmp.equals("1.0")) {
954 version = tmp;
955 warning = "SRX version 1.0 rules are subject to different interpretation.\nRead the help for more information.";
956 } else if (tmp.equals("2.0")) {
957 version = tmp;
958 warning = null;
959 } else
960 throw new OkapiIOException("Invalid version value.");
961
962 Element elem1 = getFirstElementByTagNameNS(ns, "header", srxElem);
963 headerComment = getPreviousComments(elem1, null);
964
965 tmp = elem1.getAttribute("segmentsubflows");
966 if (tmp.length() > 0)
967 segmentSubFlows = "yes".equals(tmp);
968 tmp = elem1.getAttribute("cascade");
969 if (tmp.length() > 0)
970 cascade = "yes".equals(tmp);
971
972
973 NodeList list2 = elem1.getElementsByTagNameNS(ns, "formathandle");
974 for (int i = 0; i < list2.getLength(); i++) {
975 Element elem2 = (Element) list2.item(i);
976 tmp = elem2.getAttribute("type");
977 if ("start".equals(tmp)) {
978 tmp = elem2.getAttribute("include");
979 if (tmp.length() > 0)
980 includeStartCodes = "yes".equals(tmp);
981 } else if ("end".equals(tmp)) {
982 tmp = elem2.getAttribute("include");
983 if (tmp.length() > 0)
984 includeEndCodes = "yes".equals(tmp);
985 } else if ("isolated".equals(tmp)) {
986 tmp = elem2.getAttribute("include");
987 if (tmp.length() > 0)
988 includeIsolatedCodes = "yes".equals(tmp);
989 }
990 }
991
992
993 Element elem2 = getFirstElementByTagNameNS(NSURI_OKPSRX, "options", elem1);
994 if (elem2 != null) {
995 tmp = elem2.getAttribute("oneSegmentIncludesAll");
996 if (tmp.length() > 0)
997 oneSegmentIncludesAll = "yes".equals(tmp);
998
999 tmp = elem2.getAttribute("trimLeadingWhitespaces");
1000 if (tmp.length() > 0)
1001 trimLeadingWS = "yes".equals(tmp);
1002
1003 tmp = elem2.getAttribute("trimTrailingWhitespaces");
1004 if (tmp.length() > 0)
1005 trimTrailingWS = "yes".equals(tmp);
1006
1007 tmp = elem2.getAttribute("useJavaRegex");
1008 if (tmp.length() > 0)
1009 useJavaRegex = true;
1010
1011 tmp = elem2.getAttribute("useIcu4jBreakRules");
1012 if (tmp.length() > 0)
1013 useIcu4JBreakRules = "yes".equals(tmp);
1014
1015 tmp = elem2.getAttribute("treatIsolatedCodesAsWhitespace");
1016 if (tmp.length() > 0)
1017 treatIsolatedCodesAsWhitespace = "yes".equals(tmp);
1018 }
1019
1020
1021 elem2 = getFirstElementByTagNameNS(NSURI_OKPSRX, "sample", elem1);
1022 if (elem2 != null) {
1023 setSampleText(Util.getTextContent(elem2));
1024 tmp = elem2.getAttribute("language");
1025 if (tmp.length() > 0)
1026 setSampleLanguage(tmp);
1027 tmp = elem2.getAttribute("useMappedRules");
1028 if (tmp.length() > 0)
1029 setTestOnSelectedGroup("no".equals(tmp));
1030 }
1031
1032
1033 elem2 = getFirstElementByTagNameNS(NSURI_OKPSRX, "rangeRule", elem1);
1034 if (elem2 != null) {
1035 setMaskRule(Util.getTextContent(elem2));
1036 }
1037
1038
1039 elem1 = getFirstElementByTagNameNS(ns, "body", srxElem);
1040
1041
1042 elem2 = getFirstElementByTagNameNS(ns, "languagerules", elem1);
1043 if (elem2 == null) {
1044 throw new OkapiException("the languagerules element is missing.");
1045 }
1046
1047 list2 = elem2.getElementsByTagNameNS(ns, "languagerule");
1048 for (int i = 0; i < list2.getLength(); i++) {
1049 Element elem3 = (Element) list2.item(i);
1050 ArrayList<Rule> tmpList = new ArrayList<>();
1051 String ruleName = elem3.getAttribute("languagerulename");
1052
1053 NodeList list3 = elem3.getElementsByTagNameNS(ns, "rule");
1054 for (int j = 0; j < list3.getLength(); j++) {
1055 Element elem4 = (Element) list3.item(j);
1056 Rule newRule = new Rule();
1057 newRule.comment = getPreviousComments(elem4, "rule");
1058 tmp = elem4.getAttribute("break");
1059 if (tmp.length() > 0)
1060 newRule.isBreak = "yes".equals(tmp);
1061 tmp = elem4.getAttributeNS(NSURI_OKPSRX, "active");
1062 if (tmp.length() > 0)
1063 newRule.isActive = "yes".equals(tmp);
1064 Element elem5 = getFirstElementByTagNameNS(ns, "beforebreak", elem4);
1065 if (elem5 != null)
1066 newRule.before = Util.getTextContent(elem5);
1067 elem5 = getFirstElementByTagNameNS(ns, "afterbreak", elem4);
1068 if (elem5 != null)
1069 newRule.after = Util.getTextContent(elem5);
1070 tmpList.add(newRule);
1071 }
1072 langRules.put(ruleName, tmpList);
1073 }
1074
1075
1076 elem2 = getFirstElementByTagNameNS(ns, "maprules", elem1);
1077
1078 list2 = elem2.getElementsByTagNameNS(ns, "languagemap");
1079 for (int i = 0; i < list2.getLength(); i++) {
1080 Element elem3 = (Element) list2.item(i);
1081 LanguageMap langMap = new LanguageMap();
1082 tmp = elem3.getAttribute("languagepattern");
1083 if (tmp.length() > 0)
1084 langMap.pattern = tmp;
1085 tmp = elem3.getAttribute("languagerulename");
1086 if (tmp.length() > 0)
1087 langMap.ruleName = tmp;
1088 langMaps.add(langMap);
1089 }
1090 modified = false;
1091 } catch (SAXException | XPathExpressionException | IOException
1092 | ParserConfigurationException e) {
1093 throw new OkapiIOException(e);
1094 }
1095 }
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108 private String getPreviousComments(Node startNode, String stopElement) {
1109 Node node = startNode.getPreviousSibling();
1110 while (node != null) {
1111 switch (node.getNodeType()) {
1112 case Node.COMMENT_NODE:
1113 return node.getNodeValue();
1114 case Node.ELEMENT_NODE:
1115 if ((stopElement != null) && (node.getNodeName().equals(stopElement))) {
1116 return null;
1117 }
1118 break;
1119 }
1120 node = node.getPreviousSibling();
1121 }
1122 return null;
1123 }
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137 private Element getFirstElementByTagNameNS(String ns, String tagName, Element elem) {
1138 NodeList list = elem.getElementsByTagNameNS(ns, tagName);
1139 if ((list == null) || (list.getLength() < 1))
1140 return null;
1141 return (Element) list.item(0);
1142 }
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153 public String saveRulesToString(boolean saveExtensions, boolean saveNonValidInfo) {
1154 StringWriter strWriter = new StringWriter();
1155 XMLWriter writer = new XMLWriter(strWriter);
1156 boolean current = modified;
1157 saveRules(writer, saveExtensions, saveNonValidInfo);
1158 modified = current;
1159 writer.close();
1160 return strWriter.toString();
1161 }
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173 public void saveRules(String rulesPath, boolean saveExtensions, boolean saveNonValidInfo) {
1174 XMLWriter writer = new XMLWriter(rulesPath);
1175 saveRules(writer, saveExtensions, saveNonValidInfo);
1176 }
1177
1178 private void saveRules(XMLWriter writer, boolean saveExtensions, boolean saveNonValidInfo) {
1179 try {
1180 writer.writeStartDocument();
1181 if (docComment != null) {
1182 writer.writeComment(docComment, true);
1183 }
1184 writer.writeStartElement("srx");
1185 writer.writeAttributeString("xmlns", NSURI_SRX20);
1186 if (saveExtensions) {
1187 writer.writeAttributeString("xmlns:" + NSPREFIX_OKPSRX, NSURI_OKPSRX);
1188 }
1189 writer.writeAttributeString("version", "2.0");
1190 version = "2.0";
1191 writer.writeLineBreak();
1192
1193 if (headerComment != null) {
1194 writer.writeComment(headerComment, true);
1195 }
1196 writer.writeStartElement("header");
1197 writer.writeAttributeString("segmentsubflows", (segmentSubFlows ? "yes" : "no"));
1198 writer.writeAttributeString("cascade", (cascade ? "yes" : "no"));
1199 writer.writeLineBreak();
1200
1201 writer.writeStartElement("formathandle");
1202 writer.writeAttributeString("type", "start");
1203 writer.writeAttributeString("include", (includeStartCodes ? "yes" : "no"));
1204 writer.writeEndElementLineBreak();
1205
1206 writer.writeStartElement("formathandle");
1207 writer.writeAttributeString("type", "end");
1208 writer.writeAttributeString("include", (includeEndCodes ? "yes" : "no"));
1209 writer.writeEndElementLineBreak();
1210
1211 writer.writeStartElement("formathandle");
1212 writer.writeAttributeString("type", "isolated");
1213 writer.writeAttributeString("include", (includeIsolatedCodes ? "yes" : "no"));
1214 writer.writeEndElementLineBreak();
1215
1216 if (saveExtensions) {
1217 writer.writeStartElement(NSPREFIX_OKPSRX + ":options");
1218 writer.writeAttributeString("oneSegmentIncludesAll",
1219 (oneSegmentIncludesAll ? "yes" : "no"));
1220 writer.writeAttributeString("trimLeadingWhitespaces", (trimLeadingWS ? "yes" : "no"));
1221 writer.writeAttributeString("trimTrailingWhitespaces", (trimTrailingWS ? "yes" : "no"));
1222 writer.writeAttributeString("useJavaRegex", "yes");
1223 writer.writeAttributeString("useIcu4JBreakRules",
1224 (useIcu4JBreakRules ? "yes" : "no"));
1225 writer.writeAttributeString("treatIsolatedCodesAsWhitespace",
1226 (treatIsolatedCodesAsWhitespace ? "yes" : "no"));
1227 writer.writeEndElementLineBreak();
1228
1229 writer.writeStartElement(NSPREFIX_OKPSRX + ":sample");
1230 writer.writeAttributeString("language", getSampleLanguage());
1231 writer.writeAttributeString("useMappedRules", (testOnSelectedGroup() ? "no" : "yes"));
1232 writer.writeString(getSampleText());
1233 writer.writeEndElementLineBreak();
1234
1235 writer.writeStartElement(NSPREFIX_OKPSRX + ":rangeRule");
1236 writer.writeString(getMaskRule());
1237 writer.writeEndElementLineBreak();
1238 }
1239
1240 writer.writeEndElementLineBreak();
1241
1242 writer.writeStartElement("body");
1243 writer.writeLineBreak();
1244
1245 writer.writeStartElement("languagerules");
1246 writer.writeLineBreak();
1247 for (String ruleName : langRules.keySet()) {
1248 writer.writeStartElement("languagerule");
1249 writer.writeAttributeString("languagerulename", ruleName);
1250 writer.writeLineBreak();
1251 ArrayList<Rule> langRule = langRules.get(ruleName);
1252 for (Rule rule : langRule) {
1253 if (rule.comment != null) {
1254 writer.writeComment(rule.comment, true);
1255 }
1256 writer.writeStartElement("rule");
1257 writer.writeAttributeString("break", (rule.isBreak ? "yes" : "no"));
1258
1259
1260 if (saveExtensions && saveNonValidInfo) {
1261 writer.writeAttributeString(NSPREFIX_OKPSRX + ":active",
1262 (rule.isActive ? "yes" : "no"));
1263 }
1264
1265 writer.writeLineBreak();
1266 writer.writeElementString("beforebreak", rule.before);
1267 writer.writeLineBreak();
1268 writer.writeElementString("afterbreak", rule.after);
1269 writer.writeLineBreak();
1270 writer.writeEndElementLineBreak();
1271 }
1272 writer.writeEndElementLineBreak();
1273 }
1274 writer.writeEndElementLineBreak();
1275
1276 writer.writeStartElement("maprules");
1277 writer.writeLineBreak();
1278 for (LanguageMap langMap : langMaps) {
1279 writer.writeStartElement("languagemap");
1280 writer.writeAttributeString("languagepattern", langMap.pattern);
1281 writer.writeAttributeString("languagerulename", langMap.ruleName);
1282 writer.writeEndElementLineBreak();
1283 }
1284 writer.writeEndElementLineBreak();
1285
1286 writer.writeEndElementLineBreak();
1287
1288 writer.writeEndElementLineBreak();
1289 writer.writeEndDocument();
1290 modified = false;
1291 } finally {
1292 if (writer != null)
1293 writer.close();
1294 }
1295 }
1296
1297 }