View Javadoc
1   /*
2    * ===========================================================================
3    * Copyright (C) 2008-2012 by the Okapi Framework contributors
4    * -----------------------------------------------------------------------------
5    * Licensed under the Apache License, Version 2.0 (the "License");
6    * you may not use this file except in compliance with the License.
7    * You may obtain a copy of the License at
8    * 
9    * http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   * ===========================================================================
17   */
18  
19  package net.sf.okapi.lib.segmentation;
20  
21  import java.io.File;
22  import java.io.IOException;
23  import java.io.InputStream;
24  import java.io.StringReader;
25  import java.io.StringWriter;
26  import java.util.ArrayList;
27  import java.util.LinkedHashMap;
28  import java.util.regex.Pattern;
29  
30  import javax.xml.parsers.DocumentBuilder;
31  import javax.xml.parsers.DocumentBuilderFactory;
32  import javax.xml.parsers.ParserConfigurationException;
33  import javax.xml.xpath.XPath;
34  import javax.xml.xpath.XPathConstants;
35  import javax.xml.xpath.XPathExpression;
36  import javax.xml.xpath.XPathExpressionException;
37  import javax.xml.xpath.XPathFactory;
38  
39  import org.slf4j.Logger;
40  import org.slf4j.LoggerFactory;
41  import org.w3c.dom.Document;
42  import org.w3c.dom.Element;
43  import org.w3c.dom.Node;
44  import org.w3c.dom.NodeList;
45  import org.xml.sax.InputSource;
46  import org.xml.sax.SAXException;
47  
48  import net.sf.okapi.common.DefaultEntityResolver;
49  import net.sf.okapi.common.ISegmenter;
50  import net.sf.okapi.common.LocaleId;
51  import net.sf.okapi.common.NSContextManager;
52  import net.sf.okapi.common.Util;
53  import net.sf.okapi.common.XMLWriter;
54  import net.sf.okapi.common.exceptions.OkapiException;
55  import net.sf.okapi.common.exceptions.OkapiIOException;
56  import net.sf.okapi.common.resource.TextFragment;
57  
58  /**
59   * Provides facilities to load, save, and manage segmentation rules in SRX format.
60   * <p>
61   * <strong>SV: Updates and parser hardening (2025)</strong><br>
62   * This class now integrates two important improvements to handle modern Java XML parsing securely
63   * and reliably:
64   * <ul>
65   * <li>✔ <b>BOM-safe SRX loading</b> — uses {@link net.sf.okapi.common.BOMNewlineEncodingDetector}
66   * to automatically detect and remove any Unicode byte-order mark (BOM) from the input stream before
67   * parsing.
68   * This resolves the common Xerces error “Content is not allowed in prolog.”</li>
69   * <li>✔ <b>Safe DOCTYPE support</b> — allows internal DOCTYPE declarations (needed for SRX DTD),
70   * while disabling external entity resolution and external DTD fetching for security.
71   * This avoids “DOCTYPE is disallowed…” errors while preventing XXE attacks.</li>
72   * </ul>
73   * 
74   * @version 1.47.0
75   */
76  public class SRXDocument {
77    private final Logger LOGGER = LoggerFactory.getLogger(getClass());
78  
79    private static final String NSURI_SRX20 = "http://www.lisa.org/srx20";
80    private static final String NSURI_SRX10 = "http://www.lisa.org/srx10";
81    private static final String NSURI_OKPSRX = "http://okapi.sf.net/srx-extensions";
82    private static final String NSPREFIX_OKPSRX = "okpsrx";
83    private static final String DEFAULT_SRX_FILE = "defaultSegmentation.srx";
84  
85    public static final String DEFAULT_SRX_RULES = "DEFAULT_SRX_RULES";
86  
87    /**
88     * Represents the pattern for an inline code (both special characters).
89     */
90    public static final String INLINECODE_PATTERN = String.format("([\\u%X\\u%X\\u%X].)",
91        TextFragment.MARKER_OPENING,
92        TextFragment.MARKER_CLOSING, TextFragment.MARKER_ISOLATED);
93  
94    /**
95     * Marker for INLINECODE_PATTERN in the given pattern. \Y+ = one or more
96     * codes, \Y* = zero, one or more codes, etc.
97     */
98    public static final String ANYCODE = "\\Y";
99  
100   /**
101    * Placed at the end of the 'after' expression, this marker indicates the
102    * given pattern should not have auto-insertion of AUTO_INLINECODES.
103    */
104   public static final String NOAUTO = "[noauto]";
105 
106   // Represents zero, one or more inline codes. this is used in auto-insertion
107   // cases
108   private static final String AUTO_INLINECODES = "(" + INLINECODE_PATTERN + "*)";
109 
110   private boolean cascade;
111   private boolean segmentSubFlows;
112   private boolean includeStartCodes;
113   private boolean includeEndCodes;
114   private boolean includeIsolatedCodes;
115   private boolean oneSegmentIncludesAll;
116   private boolean trimLeadingWS;
117   private boolean trimTrailingWS;
118   private boolean useJavaRegex = true;
119   private boolean useIcu4JBreakRules = false;
120   private boolean treatIsolatedCodesAsWhitespace;
121   private String version = "2.0";
122   private String warning;
123   private String sampleText;
124   private String sampleLanguage;
125   private boolean modified;
126   private boolean testOnSelectedGroup;
127   private ArrayList<LanguageMap> langMaps;
128   private LinkedHashMap<String, ArrayList<Rule>> langRules;
129   private String maskRule;
130   private String docComment;
131   private String headerComment;
132 
133   /**
134    * Creates an empty SRX document.
135    */
136   public SRXDocument() {
137     resetAll();
138   }
139 
140   /**
141    * Gets the version of this SRX document.
142    * 
143    * @return the version of this SRX document.
144    */
145   public String getVersion() {
146     return version;
147   }
148 
149   /**
150    * Indicates if a warning was issued last time a document was read.
151    * 
152    * @return true if a warning was issued, false otherwise.
153    */
154   public boolean hasWarning() {
155     return ((warning != null) && (warning.length() > 0));
156   }
157 
158   /**
159    * Gets the last warning that was issued while loading a document.
160    * 
161    * @return the text of the last warning issued, or an empty string.
162    */
163   public String getWarning() {
164     if (warning == null)
165       return "";
166     else
167       return warning;
168   }
169 
170   /**
171    * Gets the comments associated with the header of this document.
172    * 
173    * @return the comments for the header of this document, or null if there
174    *         are none.
175    */
176   public String getHeaderComments() {
177     return headerComment;
178   }
179 
180   /**
181    * Sets the comments for the header of this document.
182    * 
183    * @param text
184    *             the new comments, use null or empty string for removing the
185    *             comments.
186    */
187   public void setHeaderComments(String text) {
188     headerComment = text;
189     if ((headerComment != null) && (headerComment.length() == 0)) {
190       headerComment = null;
191     }
192   }
193 
194   /**
195    * Gets the comments associated with this document.
196    * 
197    * @return the comments for this document, or null if there are none.
198    */
199   public String getComments() {
200     return docComment;
201   }
202 
203   /**
204    * Sets the comments for this document.
205    * 
206    * @param text
207    *             the new comments, use null or empty string for removing the
208    *             comments.
209    */
210   public void setComments(String text) {
211     docComment = text;
212     if ((docComment != null) && (docComment.length() == 0)) {
213       docComment = null;
214     }
215   }
216 
217   /**
218    * Resets the document to its default empty initial state.
219    */
220   public void resetAll() {
221     langMaps = new ArrayList<>();
222     langRules = new LinkedHashMap<>();
223     maskRule = null;
224     modified = false;
225 
226     segmentSubFlows = true; // SRX default
227     cascade = false; // There is no SRX default for this
228     includeStartCodes = false; // SRX default
229     includeEndCodes = true; // SRX default
230     includeIsolatedCodes = false; // SRX default
231 
232     oneSegmentIncludesAll = false; // Extension
233     trimLeadingWS = false; // Extension
234     trimTrailingWS = false; // Extension
235     useJavaRegex = true; // Deprecated Extension (always true)
236     useIcu4JBreakRules = false;
237     treatIsolatedCodesAsWhitespace = false; // Extension
238 
239     sampleText = "Mr. Holmes is from the U.K. not the U.S. <B>Is Dr. Watson from there too?</B> Yes: both are.<BR/>";
240     sampleLanguage = "en";
241     headerComment = null;
242     docComment = null;
243   }
244 
245   /**
246    * Gets a map of all the language rules in this document.
247    * 
248    * @return a map of all the language rules.
249    */
250   public LinkedHashMap<String, ArrayList<Rule>> getAllLanguageRules() {
251     return langRules;
252   }
253 
254   /**
255    * Gets the list of rules for a given &lt;languagerule7gt; element.
256    * 
257    * @param ruleName
258    *                 the name of the &lt;languagerulegt; element to query.
259    * @return the list of rules for a given &lt;languagerulegt; element.
260    */
261   public ArrayList<Rule> getLanguageRules(String ruleName) {
262     return langRules.get(ruleName);
263   }
264 
265   /**
266    * Gets the list of all the language maps in this document.
267    * 
268    * @return the list of all the language maps.
269    */
270   public ArrayList<LanguageMap> getAllLanguagesMaps() {
271     return langMaps;
272   }
273 
274   /**
275    * Indicates if sub-flows must be segmented.
276    * 
277    * @return true if sub-flows must be segmented, false otherwise.
278    */
279   public boolean segmentSubFlows() {
280     return segmentSubFlows;
281   }
282 
283   /**
284    * Sets the flag indicating if sub-flows must be segmented.
285    * 
286    * @param value
287    *              true if sub-flows must be segmented, false otherwise.
288    */
289   public void setSegmentSubFlows(boolean value) {
290     segmentSubFlows = value;
291   }
292 
293   /**
294    * Indicates if cascading must be applied when selecting the rules for a
295    * given language pattern.
296    * 
297    * @return true if cascading must be applied, false otherwise.
298    */
299   public boolean cascade() {
300     return cascade;
301   }
302 
303   /**
304    * Sets the flag indicating if cascading must be applied when selecting the
305    * rules for a given language pattern.
306    * 
307    * @param value
308    *              true if cascading must be applied, false otherwise.
309    */
310   public void setCascade(boolean value) {
311     if (value != cascade) {
312       cascade = value;
313       modified = true;
314     }
315   }
316 
317   /**
318    * Indicates if, when there is a single segment in a text, it should include
319    * the whole text (no spaces or codes trim left/right)
320    * 
321    * @return true if a text with a single segment should include the whole
322    *         text.
323    */
324   public boolean oneSegmentIncludesAll() {
325     return oneSegmentIncludesAll;
326   }
327 
328   /**
329    * Sets the indicator that tells if when there is a single segment in a text
330    * it should include the whole text (no spaces or codes trim left/right)
331    * text.
332    * 
333    * @param value
334    *              true if a text with a single segment should include the whole
335    *              text.
336    */
337   public void setOneSegmentIncludesAll(boolean value) {
338     if (value != oneSegmentIncludesAll) {
339       oneSegmentIncludesAll = value;
340       modified = true;
341     }
342   }
343 
344   /**
345    * Indicates if this document uses ICU4J break rules.
346    * 
347    * @return true if ICU4J break rules are used, false otherwise.
348    */
349   public boolean useIcu4JBreakRules() {
350     return useIcu4JBreakRules;
351   }
352 
353   /**
354    * Sets the indicator that tells if this document uses ICU4J BreakIterator rules.
355    * {@link com.ibm.icu.text.BreakIterator} break positions are converted to SRX-like rules and used
356    * as default rules for all languages.
357    * 
358    * @param value
359    *              true if ICU4J rules should be used as defaults
360    *              expression, false if no ICU4J rules should be used
361    */
362   public void setUseICU4JBreakRules(boolean value) {
363     if (useIcu4JBreakRules != value) {
364       useIcu4JBreakRules = value;
365       modified = true;
366     }
367   }
368 
369   /**
370    * Indicates if this document should treat isolated codes as whitespace when
371    * matching SRX rules.
372    * 
373    * @return true if isolated codes should be treated as whitespace
374    */
375   public boolean treatIsolatedCodesAsWhitespace() {
376     return treatIsolatedCodesAsWhitespace;
377   }
378 
379   /**
380    * Sets the indicator if this document should treat isolated codes as
381    * whitespace when matching SRX rules.
382    * 
383    * @param value
384    *              true if isolated codes should be treated as whitespace
385    */
386   public void setTreatIsolatedCodesAsWhitespace(boolean value) {
387     if (value != treatIsolatedCodesAsWhitespace) {
388       treatIsolatedCodesAsWhitespace = value;
389       modified = true;
390     }
391   }
392 
393   /**
394    * Indicates if leading white-spaces should be left outside the segments.
395    * 
396    * @return true if the leading white-spaces should be trimmed.
397    */
398   public boolean trimLeadingWhitespaces() {
399     return trimLeadingWS;
400   }
401 
402   /**
403    * Sets the indicator that tells if leading white-spaces should be left
404    * outside the segments.
405    * 
406    * @param value
407    *              true if the leading white-spaces should be trimmed.
408    */
409   public void setTrimLeadingWhitespaces(boolean value) {
410     if (value != trimLeadingWS) {
411       trimLeadingWS = value;
412       modified = true;
413     }
414   }
415 
416   /**
417    * Indicates if trailing white-spaces should be left outside the segments.
418    * 
419    * @return true if the trailing white-spaces should be trimmed.
420    */
421   public boolean trimTrailingWhitespaces() {
422     return trimTrailingWS;
423   }
424 
425   /**
426    * Sets the indicator that tells if trailing white-spaces should be left
427    * outside the segments.
428    * 
429    * @param value
430    *              true if the trailing white-spaces should be trimmed.
431    */
432   public void setTrimTrailingWhitespaces(boolean value) {
433     if (value != trimTrailingWS) {
434       trimTrailingWS = value;
435       modified = true;
436     }
437   }
438 
439   /**
440    * Indicates if start codes should be included (See SRX implementation
441    * notes).
442    * 
443    * @return true if start codes should be included, false otherwise.
444    */
445   public boolean includeStartCodes() {
446     return includeStartCodes;
447   }
448 
449   /**
450    * Sets the indicator that tells if start codes should be included or not.
451    * (See SRX implementation notes).
452    * 
453    * @param value
454    *              true if start codes should be included, false otherwise.
455    */
456   public void setIncludeStartCodes(boolean value) {
457     if (value != includeStartCodes) {
458       includeStartCodes = value;
459       modified = true;
460     }
461   }
462 
463   /**
464    * Indicates if end codes should be included (See SRX implementation notes).
465    * 
466    * @return true if end codes should be included, false otherwise.
467    */
468   public boolean includeEndCodes() {
469     return includeEndCodes;
470   }
471 
472   /**
473    * Sets the indicator that tells if end codes should be included or not.
474    * (See SRX implementation notes).
475    * 
476    * @param value
477    *              true if end codes should be included, false otherwise.
478    */
479   public void setIncludeEndCodes(boolean value) {
480     if (value != includeEndCodes) {
481       includeEndCodes = value;
482       modified = true;
483     }
484   }
485 
486   /**
487    * Indicates if isolated codes should be included (See SRX implementation
488    * notes).
489    * 
490    * @return true if isolated codes should be included, false otherwise.
491    */
492   public boolean includeIsolatedCodes() {
493     return includeIsolatedCodes;
494   }
495 
496   /**
497    * Sets the indicator that tells if isolated codes should be included or
498    * not. (See SRX implementation notes).
499    * 
500    * @param value
501    *              true if isolated codes should be included, false otherwise.
502    */
503   public void setIncludeIsolatedCodes(boolean value) {
504     if (value != includeIsolatedCodes) {
505       includeIsolatedCodes = value;
506       modified = true;
507     }
508   }
509 
510   /**
511    * Gets the current pattern of the mask rule.
512    * 
513    * @return the current pattern of the mask rule.
514    */
515   public String getMaskRule() {
516     return maskRule;
517   }
518 
519   /**
520    * Sets the pattern for the mask rule.
521    * 
522    * @param pattern
523    *                the new pattern to use for the mask rule.
524    */
525   public void setMaskRule(String pattern) {
526     if (pattern != null) {
527       if (!pattern.equals(maskRule)) {
528         modified = true;
529       }
530     } else if (maskRule != null) {
531       modified = true;
532     }
533     maskRule = pattern;
534   }
535 
536   /**
537    * Gets the current sample text. This text is an example string that can be
538    * used to test the various rules. It can be handy to be able to save it
539    * along with the SRX document.
540    * 
541    * @return the sample text, or an empty string.
542    */
543   public String getSampleText() {
544     if (sampleText == null)
545       return "";
546     else
547       return sampleText;
548   }
549 
550   /**
551    * Sets the sample text.
552    * 
553    * @param value
554    *              the new sample text.
555    */
556   public void setSampleText(String value) {
557     if (value != null) {
558       if (!value.equals(sampleText)) {
559         modified = true;
560       }
561     } else if (sampleText != null) {
562       modified = true;
563     }
564     sampleText = value;
565   }
566 
567   /**
568    * Gets the current sample language code.
569    * 
570    * @return the current sample language code.
571    */
572   public String getSampleLanguage() {
573     return sampleLanguage;
574   }
575 
576   /**
577    * Sets the sample language code. Null or empty strings are changed to the
578    * default language.
579    * 
580    * @param value
581    *              the new sample language code.
582    */
583   public void setSampleLanguage(String value) {
584     if ((value == null) || (value.length() == 0)) {
585       sampleLanguage = "en";
586       modified = true;
587     } else {
588       if (!value.equals(sampleLanguage)) {
589         sampleLanguage = value;
590         modified = true;
591       }
592     }
593   }
594 
595   /**
596    * Indicates that, when sampling the rules, the sample should be computed
597    * using only a selected group of rules.
598    * 
599    * @return true to test using only a selected group of rules. False to test
600    *         using all the rules matching a given language.
601    */
602   public boolean testOnSelectedGroup() {
603     return testOnSelectedGroup;
604   }
605 
606   /**
607    * Sets the indicator on how to apply rules for samples.
608    * 
609    * @param value
610    *              true to test using only a selected group of rules. False to
611    *              test using all the rules matching a given language.
612    */
613   public void setTestOnSelectedGroup(boolean value) {
614     if (value != testOnSelectedGroup) {
615       testOnSelectedGroup = value;
616       modified = true;
617     }
618   }
619 
620   /**
621    * Indicates if the document has been modified since the last load or save.
622    * 
623    * @return true if the document have been modified, false otherwise.
624    */
625   public boolean isModified() {
626     return modified;
627   }
628 
629   /**
630    * Sets the flag indicating if the document has been modified since the last
631    * load or save. If you make change to the rules or language maps directly
632    * to the lists, make sure to set this flag to true.
633    * 
634    * @param value
635    *              true if the document has been changed, false otherwise.
636    */
637   public void setModified(boolean value) {
638     modified = value;
639   }
640 
641   /**
642    * Adds a language rule to this SRX document. If another language rule with
643    * the same name exists already it will be replaced by the new one, without
644    * warning.
645    * 
646    * @param name
647    *                 name of the language rule to add.
648    * @param langRule
649    *                 language rule object to add.
650    */
651   public void addLanguageRule(String name, ArrayList<Rule> langRule) {
652     langRules.put(name, langRule);
653     modified = true;
654   }
655 
656   /**
657    * Adds a language map to this document. The new map is added at the end of
658    * the one already there.
659    * 
660    * @param langMap
661    *                the language map object to add.
662    */
663   public void addLanguageMap(LanguageMap langMap) {
664     langMaps.add(langMap);
665     modified = true;
666   }
667 
668   /**
669    * Compiles the all language rules applicable for a given language code, and
670    * assign them to a segmenter. This method applies the language code you
671    * specify to the language mappings currently available in the document and
672    * compile the rules when one or more language map is found. The matching is
673    * done in the order of the list of language maps and more than one can be
674    * selected if {@link #cascade()} is true.
675    * 
676    * @param languageCode
677    *                          the language code. the value should be a BCP-47 value (e.g.
678    *                          "de", "fr-ca", etc.)
679    * @param existingSegmenter
680    *                          optional existing SRXSegmenter object to re-use. Use null for
681    *                          not re-using anything.
682    * @return the instance of the segmenter with the new compiled rules.
683    */
684   public ISegmenter compileLanguageRules(LocaleId languageCode, ISegmenter existingSegmenter) {
685     SRXSegmenter segmenter = null;
686     if ((existingSegmenter != null) && (existingSegmenter instanceof SRXSegmenter)) {
687       segmenter = (SRXSegmenter) existingSegmenter;
688     }
689 
690     if (segmenter != null) {
691       // Check if we really need to re-compile
692       if (languageCode != null) {
693         if (languageCode.equals(segmenter.getLanguage()) && (cascade == segmenter.cascade()))
694           return segmenter;
695       }
696       segmenter.reset();
697     } else {
698       segmenter = new SRXSegmenter();
699     }
700 
701     segmenter.setCascade(cascade);
702     segmenter.setOptions(segmentSubFlows, includeStartCodes, includeEndCodes, includeIsolatedCodes,
703         oneSegmentIncludesAll, trimLeadingWS, trimTrailingWS, useJavaRegex, useIcu4JBreakRules,
704         treatIsolatedCodesAsWhitespace);
705 
706     for (LanguageMap langMap : langMaps) {
707       if (Pattern.matches(langMap.pattern, languageCode.toString())) {
708         compileRules(segmenter, langMap.ruleName);
709         if (!segmenter.cascade())
710           break; // Stop at the first matching map
711       }
712     }
713 
714     segmenter.setLanguage(languageCode);
715     return segmenter;
716   }
717 
718   /**
719    * Compiles a single language rule group and assign it to a segmenter.
720    * 
721    * @param ruleName
722    *                          the name of the rule group to apply.
723    * @param existingSegmenter
724    *                          optional existing SRXSegmenter object to re-use. Use null for
725    *                          not re-using anything.
726    * @return the instance of the segmenter with the new compiled rules.
727    */
728   public ISegmenter compileSingleLanguageRule(String ruleName, ISegmenter existingSegmenter) {
729     SRXSegmenter segmenter = null;
730     if ((existingSegmenter != null) && (existingSegmenter instanceof SRXSegmenter)) {
731       segmenter = (SRXSegmenter) existingSegmenter;
732     }
733 
734     if (segmenter != null) {
735       // Check if we really need to re-compile
736       if (ruleName != null) {
737         if (segmenter.getLanguage().equals(LocaleId.EMPTY))
738           return segmenter;
739       }
740       segmenter.reset();
741     } else {
742       segmenter = new SRXSegmenter();
743     }
744 
745     segmenter.setOptions(segmentSubFlows, includeStartCodes, includeEndCodes, includeIsolatedCodes,
746         oneSegmentIncludesAll, trimLeadingWS, trimTrailingWS, useJavaRegex, useIcu4JBreakRules,
747         treatIsolatedCodesAsWhitespace);
748     compileRules(segmenter, ruleName);
749     segmenter.setLanguage(LocaleId.EMPTY);
750     return segmenter;
751   }
752 
753   /**
754    * Compiles a language rule into the current set of active rules.
755    * 
756    * @param ruleName
757    *                 the name of the language rule to compile.
758    */
759   private void compileRules(SRXSegmenter segmenter, String ruleName) {
760     if (!langRules.containsKey(ruleName) && !useIcu4JBreakRules) {
761       throw new SegmentationRuleException("language rule '" + ruleName + "' not found.");
762     }
763     ArrayList<Rule> langRule = langRules.get(ruleName);
764     String pattern = null;
765     for (Rule rule : langRule) {
766       if (rule.isActive) {
767         // Replace special markers ANYCODES by inline code pattern
768         pattern = generateRuleRegex(rule);
769         pattern = pattern.replace(ANYCODE, INLINECODE_PATTERN);
770 
771         // Compile and add the rule
772         segmenter.addRule(new CompiledRule(pattern, rule.isBreak));
773       }
774     }
775 
776     // Range rules
777     segmenter.setMaskRule(maskRule);
778   }
779 
780   public String generateRuleRegex(Rule rule) {
781     String pattern = "";
782     String beforePattern = "";
783     String afterPattern = "";
784 
785     afterPattern = "(" + rule.after + ")";
786     if (rule.before.endsWith(NOAUTO)) {
787       // If the rule.before ends with NOAUTO, then we do not put
788       // pattern for in-line codes
789       beforePattern = "(" + rule.before.substring(0, rule.before.length() - NOAUTO.length()) + ")";
790     } else {
791       // The compiled rule is made of two groups: the pattern before
792       // and the pattern after
793       // the break. A special pattern for in-line codes is also added
794       // transparently.
795       beforePattern = "(" + rule.before + AUTO_INLINECODES + ")";
796     }
797 
798     if (rule.before.isEmpty()) {
799       // must add empty group to maintain group count
800       pattern = "(.|\n)" + afterPattern;
801     } else if (rule.after.isEmpty()) {
802       // must add empty group to maintain group count
803       pattern = beforePattern + "()";
804     } else {
805       pattern = beforePattern + afterPattern;
806     }
807 
808     return pattern;
809 
810   }
811 
812   /**
813    * Loads an SRX document from a CharSequence object. Calling this method
814    * resets all settings and rules to their default state and then populate
815    * them with the data stored in the document being loaded. The rules can be
816    * embedded inside another vocabulary.
817    * 
818    * @param data
819    *             the string containing the SRX document to load.
820    */
821   public void loadRules(CharSequence data) {
822     loadRules(data, 1);
823     modified = true;
824   }
825 
826   /**
827    * Loads an SRX document from a file. Calling this method resets all
828    * settings and rules to their default state and then populate them with the
829    * data stored in the document being loaded. The rules can be embedded
830    * inside another vocabulary.
831    * 
832    * <p>
833    * For {@code SRXDocument.DEFAULT_SRX_RULES} (the string {@code "DEFAULT_SRX_RULES"} in serialized
834    * parameters)
835    * this will load the (Okapi recommended) {@code .srx} file, embedded in the library jar.
836    * </p>
837    * 
838    * @param pathOrURL
839    *                  The full path or URL of the document to load.
840    */
841   public void loadRules(String pathOrURL) {
842     if (DEFAULT_SRX_RULES.equals(pathOrURL)) {
843       loadRules(getClass().getResourceAsStream(DEFAULT_SRX_FILE));
844     } else {
845       loadRules(pathOrURL, 0);
846     }
847   }
848 
849   /**
850    * Loads an SRX document from an input stream. Calling this method resets
851    * all settings and rules to their default state and then populate them with
852    * the data stored in the document being loaded. The rules can be embedded
853    * inside another vocabulary.
854    * 
855    * @param inputStream
856    *                    the input stream to read from.
857    */
858   public void loadRules(InputStream inputStream) {
859     loadRules(inputStream, 2);
860   }
861 
862   private void loadRules(Object input, int inputType) {
863     try {
864       DocumentBuilderFactory Fact = DocumentBuilderFactory.newInstance();
865       Fact.setValidating(false);
866       Fact.setNamespaceAware(true);
867 
868       // Security configuration: Allow DOCTYPE but prevent XXE attacks
869       try {
870         // Allow DOCTYPE declarations (needed for some SRX files)
871         Fact.setFeature("http://apache.org/xml/features/disallow-doctype-decl", false);
872 
873         // Disable external entity resolution for security (prevents XXE attacks)
874         Fact.setFeature("http://xml.org/sax/features/external-general-entities", false);
875         Fact.setFeature("http://xml.org/sax/features/external-parameter-entities", false);
876         Fact.setFeature("http://apache.org/xml/features/nonvalidating/load-external-dtd", false);
877 
878         // Additional security hardening
879         Fact.setXIncludeAware(false);
880         Fact.setExpandEntityReferences(false);
881 
882       } catch (ParserConfigurationException e) {
883         LOGGER.warn("Unsupported XML feature on this platform, falling back safely.", e);
884       }
885 
886       DocumentBuilder docBuilder = Fact.newDocumentBuilder();
887       docBuilder.setEntityResolver(new DefaultEntityResolver());
888 
889       Document doc;
890 
891       if (inputType == 0) {
892         // Handle path or URL case with BOM detection
893         String pathOrURL = (String) input;
894         File srxFile = new File(Util.toURI(pathOrURL));
895         if (!srxFile.exists()) {
896           throw new OkapiException("SRX file not found");
897         }
898 
899         // Use BOM detector to clean stream before parsing
900         try (InputStream in = new java.io.FileInputStream(srxFile)) {
901           net.sf.okapi.common.BOMNewlineEncodingDetector detector = new net.sf.okapi.common.BOMNewlineEncodingDetector(
902               in, java.nio.charset.StandardCharsets.UTF_8);
903           detector.detectAndRemoveBom();
904           doc = docBuilder.parse(detector.getInputStream());
905         }
906 
907       } else if (inputType == 1) {
908         // String input (no BOM expected)
909         CharSequence data = (CharSequence) input;
910         doc = docBuilder.parse(new InputSource(new StringReader(data.toString())));
911 
912       } else {
913         // Stream input: remove BOM before parsing
914         InputStream inputStream = (InputStream) input;
915         net.sf.okapi.common.BOMNewlineEncodingDetector detector = new net.sf.okapi.common.BOMNewlineEncodingDetector(
916             inputStream, java.nio.charset.StandardCharsets.UTF_8);
917         detector.detectAndRemoveBom();
918         doc = docBuilder.parse(detector.getInputStream());
919       }
920 
921       resetAll();
922 
923       XPathFactory xpathFac = Util.createXPathFactory();
924       XPath xpath = xpathFac.newXPath();
925       NSContextManager nsContext = new NSContextManager();
926       nsContext.add("srx", NSURI_SRX20);
927       nsContext.add(NSPREFIX_OKPSRX, NSURI_OKPSRX);
928       nsContext.add("srx1", NSURI_SRX10);
929       xpath.setNamespaceContext(nsContext);
930 
931       // Try to get the root and detect if namespaces are used or not.
932       String ns = NSURI_SRX20;
933       XPathExpression xpe = xpath.compile("//srx:srx");
934       NodeList srxList = (NodeList) xpe.evaluate(doc, XPathConstants.NODESET);
935       if (srxList.getLength() < 1) {
936         xpe = xpath.compile("//srx1:srx");
937         srxList = (NodeList) xpe.evaluate(doc, XPathConstants.NODESET);
938         if (srxList.getLength() < 1) {
939           xpe = xpath.compile("//srx");
940           srxList = (NodeList) xpe.evaluate(doc, XPathConstants.NODESET);
941           if (srxList.getLength() < 1) {
942             return;
943           }
944           ns = "";
945         } else
946           ns = NSURI_SRX10;
947       }
948 
949       // Treat the first occurrence (we assume there is never more in one file)
950       Element srxElem = (Element) srxList.item(0);
951       docComment = getPreviousComments(srxElem, null);
952       String tmp = srxElem.getAttribute("version");
953       if (tmp.equals("1.0")) {
954         version = tmp;
955         warning = "SRX version 1.0 rules are subject to different interpretation.\nRead the help for more information.";
956       } else if (tmp.equals("2.0")) {
957         version = tmp;
958         warning = null;
959       } else
960         throw new OkapiIOException("Invalid version value.");
961 
962       Element elem1 = getFirstElementByTagNameNS(ns, "header", srxElem);
963       headerComment = getPreviousComments(elem1, null);
964 
965       tmp = elem1.getAttribute("segmentsubflows");
966       if (tmp.length() > 0)
967         segmentSubFlows = "yes".equals(tmp);
968       tmp = elem1.getAttribute("cascade");
969       if (tmp.length() > 0)
970         cascade = "yes".equals(tmp);
971 
972       // formathandle elements
973       NodeList list2 = elem1.getElementsByTagNameNS(ns, "formathandle");
974       for (int i = 0; i < list2.getLength(); i++) {
975         Element elem2 = (Element) list2.item(i);
976         tmp = elem2.getAttribute("type");
977         if ("start".equals(tmp)) {
978           tmp = elem2.getAttribute("include");
979           if (tmp.length() > 0)
980             includeStartCodes = "yes".equals(tmp);
981         } else if ("end".equals(tmp)) {
982           tmp = elem2.getAttribute("include");
983           if (tmp.length() > 0)
984             includeEndCodes = "yes".equals(tmp);
985         } else if ("isolated".equals(tmp)) {
986           tmp = elem2.getAttribute("include");
987           if (tmp.length() > 0)
988             includeIsolatedCodes = "yes".equals(tmp);
989         }
990       }
991 
992       // Extension: options
993       Element elem2 = getFirstElementByTagNameNS(NSURI_OKPSRX, "options", elem1);
994       if (elem2 != null) {
995         tmp = elem2.getAttribute("oneSegmentIncludesAll");
996         if (tmp.length() > 0)
997           oneSegmentIncludesAll = "yes".equals(tmp);
998 
999         tmp = elem2.getAttribute("trimLeadingWhitespaces");
1000         if (tmp.length() > 0)
1001           trimLeadingWS = "yes".equals(tmp);
1002 
1003         tmp = elem2.getAttribute("trimTrailingWhitespaces");
1004         if (tmp.length() > 0)
1005           trimTrailingWS = "yes".equals(tmp);
1006 
1007         tmp = elem2.getAttribute("useJavaRegex");
1008         if (tmp.length() > 0)
1009           useJavaRegex = true;
1010 
1011         tmp = elem2.getAttribute("useIcu4jBreakRules");
1012         if (tmp.length() > 0)
1013           useIcu4JBreakRules = "yes".equals(tmp);
1014 
1015         tmp = elem2.getAttribute("treatIsolatedCodesAsWhitespace");
1016         if (tmp.length() > 0)
1017           treatIsolatedCodesAsWhitespace = "yes".equals(tmp);
1018       }
1019 
1020       // Extension: sample
1021       elem2 = getFirstElementByTagNameNS(NSURI_OKPSRX, "sample", elem1);
1022       if (elem2 != null) {
1023         setSampleText(Util.getTextContent(elem2));
1024         tmp = elem2.getAttribute("language");
1025         if (tmp.length() > 0)
1026           setSampleLanguage(tmp);
1027         tmp = elem2.getAttribute("useMappedRules");
1028         if (tmp.length() > 0)
1029           setTestOnSelectedGroup("no".equals(tmp));
1030       }
1031 
1032       // Extension: rangeRule
1033       elem2 = getFirstElementByTagNameNS(NSURI_OKPSRX, "rangeRule", elem1);
1034       if (elem2 != null) {
1035         setMaskRule(Util.getTextContent(elem2));
1036       }
1037 
1038       // Get the body element
1039       elem1 = getFirstElementByTagNameNS(ns, "body", srxElem);
1040 
1041       // languagerules
1042       elem2 = getFirstElementByTagNameNS(ns, "languagerules", elem1);
1043       if (elem2 == null) {
1044         throw new OkapiException("the languagerules element is missing.");
1045       }
1046       // For each languageRule
1047       list2 = elem2.getElementsByTagNameNS(ns, "languagerule");
1048       for (int i = 0; i < list2.getLength(); i++) {
1049         Element elem3 = (Element) list2.item(i);
1050         ArrayList<Rule> tmpList = new ArrayList<>();
1051         String ruleName = elem3.getAttribute("languagerulename");
1052         // For each rule
1053         NodeList list3 = elem3.getElementsByTagNameNS(ns, "rule");
1054         for (int j = 0; j < list3.getLength(); j++) {
1055           Element elem4 = (Element) list3.item(j);
1056           Rule newRule = new Rule();
1057           newRule.comment = getPreviousComments(elem4, "rule");
1058           tmp = elem4.getAttribute("break");
1059           if (tmp.length() > 0)
1060             newRule.isBreak = "yes".equals(tmp);
1061           tmp = elem4.getAttributeNS(NSURI_OKPSRX, "active");
1062           if (tmp.length() > 0)
1063             newRule.isActive = "yes".equals(tmp);
1064           Element elem5 = getFirstElementByTagNameNS(ns, "beforebreak", elem4);
1065           if (elem5 != null)
1066             newRule.before = Util.getTextContent(elem5);
1067           elem5 = getFirstElementByTagNameNS(ns, "afterbreak", elem4);
1068           if (elem5 != null)
1069             newRule.after = Util.getTextContent(elem5);
1070           tmpList.add(newRule);
1071         }
1072         langRules.put(ruleName, tmpList);
1073       }
1074 
1075       // maprules
1076       elem2 = getFirstElementByTagNameNS(ns, "maprules", elem1);
1077       // For each languagemap
1078       list2 = elem2.getElementsByTagNameNS(ns, "languagemap");
1079       for (int i = 0; i < list2.getLength(); i++) {
1080         Element elem3 = (Element) list2.item(i);
1081         LanguageMap langMap = new LanguageMap();
1082         tmp = elem3.getAttribute("languagepattern");
1083         if (tmp.length() > 0)
1084           langMap.pattern = tmp;
1085         tmp = elem3.getAttribute("languagerulename");
1086         if (tmp.length() > 0)
1087           langMap.ruleName = tmp;
1088         langMaps.add(langMap);
1089       }
1090       modified = false;
1091     } catch (SAXException | XPathExpressionException | IOException
1092         | ParserConfigurationException e) {
1093       throw new OkapiIOException(e);
1094     }
1095   }
1096 
1097   /**
1098    * Gathers comments before a given element.
1099    * 
1100    * @param startNode
1101    *                    the node where to start. Use null to allow the gathering to go
1102    *                    at the parent level.
1103    * @param stopElement
1104    *                    the name of the node where to stop, or null for no limitation.
1105    * @return the string with all the comments found in the given scope, or
1106    *         null if no comments were found.
1107    */
1108   private String getPreviousComments(Node startNode, String stopElement) {
1109     Node node = startNode.getPreviousSibling();
1110     while (node != null) {
1111       switch (node.getNodeType()) {
1112       case Node.COMMENT_NODE:
1113         return node.getNodeValue();
1114       case Node.ELEMENT_NODE:
1115         if ((stopElement != null) && (node.getNodeName().equals(stopElement))) {
1116           return null;
1117         }
1118         break;
1119       }
1120       node = node.getPreviousSibling();
1121     }
1122     return null;
1123   }
1124   
1125   /**
1126    * Gets the first occurrence of a given element in a given namespace from a
1127    * given element.
1128    * 
1129    * @param ns
1130    *            the namespace URI to look for.
1131    * @param tagName
1132    *            the name of the element to look for.
1133    * @param elem
1134    *            the element where to look for.
1135    * @return the first found element, or null.
1136    */
1137   private Element getFirstElementByTagNameNS(String ns, String tagName, Element elem) {
1138     NodeList list = elem.getElementsByTagNameNS(ns, tagName);
1139     if ((list == null) || (list.getLength() < 1))
1140       return null;
1141     return (Element) list.item(0);
1142   }
1143 
1144   /**
1145    * Saves the current rules to an SRX string.
1146    * 
1147    * @param saveExtensions
1148    *                         true to save Okapi SRX extensions, false otherwise.
1149    * @param saveNonValidInfo
1150    *                         true to save non-SRX-valid attributes, false otherwise.
1151    * @return the string containing the saved SRX rules.
1152    */
1153   public String saveRulesToString(boolean saveExtensions, boolean saveNonValidInfo) {
1154     StringWriter strWriter = new StringWriter();
1155     XMLWriter writer = new XMLWriter(strWriter);
1156     boolean current = modified;
1157     saveRules(writer, saveExtensions, saveNonValidInfo);
1158     modified = current; // Keep the same state for modified
1159     writer.close();
1160     return strWriter.toString();
1161   }
1162 
1163   /**
1164    * Saves the current rules to an SRX rules document.
1165    * 
1166    * @param rulesPath
1167    *                         the full path of the file where to save the rules.
1168    * @param saveExtensions
1169    *                         true to save Okapi SRX extensions, false otherwise.
1170    * @param saveNonValidInfo
1171    *                         true to save non-SRX-valid attributes, false otherwise.
1172    */
1173   public void saveRules(String rulesPath, boolean saveExtensions, boolean saveNonValidInfo) {
1174     XMLWriter writer = new XMLWriter(rulesPath);
1175     saveRules(writer, saveExtensions, saveNonValidInfo);
1176   }
1177 
1178   private void saveRules(XMLWriter writer, boolean saveExtensions, boolean saveNonValidInfo) {
1179     try {
1180       writer.writeStartDocument();
1181       if (docComment != null) {
1182         writer.writeComment(docComment, true);
1183       }
1184       writer.writeStartElement("srx");
1185       writer.writeAttributeString("xmlns", NSURI_SRX20);
1186       if (saveExtensions) {
1187         writer.writeAttributeString("xmlns:" + NSPREFIX_OKPSRX, NSURI_OKPSRX);
1188       }
1189       writer.writeAttributeString("version", "2.0");
1190       version = "2.0";
1191       writer.writeLineBreak();
1192 
1193       if (headerComment != null) {
1194         writer.writeComment(headerComment, true);
1195       }
1196       writer.writeStartElement("header");
1197       writer.writeAttributeString("segmentsubflows", (segmentSubFlows ? "yes" : "no"));
1198       writer.writeAttributeString("cascade", (cascade ? "yes" : "no"));
1199       writer.writeLineBreak();
1200 
1201       writer.writeStartElement("formathandle");
1202       writer.writeAttributeString("type", "start");
1203       writer.writeAttributeString("include", (includeStartCodes ? "yes" : "no"));
1204       writer.writeEndElementLineBreak(); // formathandle
1205 
1206       writer.writeStartElement("formathandle");
1207       writer.writeAttributeString("type", "end");
1208       writer.writeAttributeString("include", (includeEndCodes ? "yes" : "no"));
1209       writer.writeEndElementLineBreak(); // formathandle
1210 
1211       writer.writeStartElement("formathandle");
1212       writer.writeAttributeString("type", "isolated");
1213       writer.writeAttributeString("include", (includeIsolatedCodes ? "yes" : "no"));
1214       writer.writeEndElementLineBreak(); // formathandle
1215 
1216       if (saveExtensions) {
1217         writer.writeStartElement(NSPREFIX_OKPSRX + ":options");
1218         writer.writeAttributeString("oneSegmentIncludesAll",
1219             (oneSegmentIncludesAll ? "yes" : "no"));
1220         writer.writeAttributeString("trimLeadingWhitespaces", (trimLeadingWS ? "yes" : "no"));
1221         writer.writeAttributeString("trimTrailingWhitespaces", (trimTrailingWS ? "yes" : "no"));
1222         writer.writeAttributeString("useJavaRegex", "yes");
1223         writer.writeAttributeString("useIcu4JBreakRules",
1224             (useIcu4JBreakRules ? "yes" : "no"));
1225         writer.writeAttributeString("treatIsolatedCodesAsWhitespace",
1226             (treatIsolatedCodesAsWhitespace ? "yes" : "no"));
1227         writer.writeEndElementLineBreak(); // okpsrx:options
1228 
1229         writer.writeStartElement(NSPREFIX_OKPSRX + ":sample");
1230         writer.writeAttributeString("language", getSampleLanguage());
1231         writer.writeAttributeString("useMappedRules", (testOnSelectedGroup() ? "no" : "yes"));
1232         writer.writeString(getSampleText());
1233         writer.writeEndElementLineBreak(); // okpsrx:sample
1234 
1235         writer.writeStartElement(NSPREFIX_OKPSRX + ":rangeRule");
1236         writer.writeString(getMaskRule());
1237         writer.writeEndElementLineBreak(); // okpsrx:rangeRule
1238       }
1239 
1240       writer.writeEndElementLineBreak(); // header
1241 
1242       writer.writeStartElement("body");
1243       writer.writeLineBreak();
1244 
1245       writer.writeStartElement("languagerules");
1246       writer.writeLineBreak();
1247       for (String ruleName : langRules.keySet()) {
1248         writer.writeStartElement("languagerule");
1249         writer.writeAttributeString("languagerulename", ruleName);
1250         writer.writeLineBreak();
1251         ArrayList<Rule> langRule = langRules.get(ruleName);
1252         for (Rule rule : langRule) {
1253           if (rule.comment != null) {
1254             writer.writeComment(rule.comment, true);
1255           }
1256           writer.writeStartElement("rule");
1257           writer.writeAttributeString("break", (rule.isBreak ? "yes" : "no"));
1258           // Start of non-standard SRX 2.0 (non-SRX attributes not
1259           // allowed)
1260           if (saveExtensions && saveNonValidInfo) {
1261             writer.writeAttributeString(NSPREFIX_OKPSRX + ":active",
1262                 (rule.isActive ? "yes" : "no"));
1263           }
1264           // End of non-Standard SRX
1265           writer.writeLineBreak();
1266           writer.writeElementString("beforebreak", rule.before);
1267           writer.writeLineBreak();
1268           writer.writeElementString("afterbreak", rule.after);
1269           writer.writeLineBreak();
1270           writer.writeEndElementLineBreak(); // rule
1271         }
1272         writer.writeEndElementLineBreak(); // languagerule
1273       }
1274       writer.writeEndElementLineBreak(); // languagerules
1275 
1276       writer.writeStartElement("maprules");
1277       writer.writeLineBreak();
1278       for (LanguageMap langMap : langMaps) {
1279         writer.writeStartElement("languagemap");
1280         writer.writeAttributeString("languagepattern", langMap.pattern);
1281         writer.writeAttributeString("languagerulename", langMap.ruleName);
1282         writer.writeEndElementLineBreak(); // languagemap
1283       }
1284       writer.writeEndElementLineBreak(); // maprules
1285 
1286       writer.writeEndElementLineBreak(); // body
1287 
1288       writer.writeEndElementLineBreak(); // srx
1289       writer.writeEndDocument();
1290       modified = false;
1291     } finally {
1292       if (writer != null)
1293         writer.close();
1294     }
1295   }
1296 
1297 }