View Javadoc
1   package com.acumenvelocity.ath.steps;
2   
3   import java.util.ArrayList;
4   import java.util.List;
5   
6   import org.slf4j.Logger;
7   import org.slf4j.LoggerFactory;
8   
9   import net.sf.okapi.common.Event;
10  import net.sf.okapi.common.IParameters;
11  import net.sf.okapi.common.ISegmenter;
12  import net.sf.okapi.common.StringUtil;
13  import net.sf.okapi.common.filters.IFilter;
14  import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
15  import net.sf.okapi.common.pipeline.annotations.StepParameterType;
16  import net.sf.okapi.common.resource.ITextUnit;
17  import net.sf.okapi.common.resource.RawDocument;
18  import net.sf.okapi.common.resource.TextPart;
19  import net.sf.okapi.lib.segmentation.SRXDocument;
20  
21  /**
22   * Base class for sentence alignment steps that align source and target documents
23   * at both paragraph and sentence levels.
24   */
25  public abstract class BaseAlignerStep extends BaseTuBatchProcessingStep {
26  
27    protected final Logger LOGGER = LoggerFactory.getLogger(getClass());
28  
29    private IFilter targetFilter;
30    private RawDocument targetInput = null;
31    private ISegmenter sourceSegmenter;
32    private ISegmenter targetSegmenter;
33  
34    private final List<ITextUnit> sourceTUs = new ArrayList<>();
35    private final List<ITextUnit> targetTUs = new ArrayList<>();
36  
37    public BaseAlignerStep(IFilter targetFilter) {
38      super();
39      this.targetFilter = targetFilter;
40    }
41  
42    @StepParameterMapping(parameterType = StepParameterType.SECOND_INPUT_RAWDOC)
43    public void setSecondInput(RawDocument secondInput) {
44      this.targetInput = secondInput;
45    }
46  
47    /**
48     * Get the parameters object for this aligner.
49     * Must return a parameters class that implements the required parameter methods.
50     */
51    @Override
52    public abstract IParameters getParameters();
53  
54    /**
55     * Check if source segmentation is enabled.
56     */
57    protected abstract boolean isSegmentSource();
58  
59    /**
60     * Check if target segmentation is enabled.
61     */
62    protected abstract boolean isSegmentTarget();
63  
64    /**
65     * Check if custom source segmentation rules should be used.
66     */
67    protected abstract boolean isUseCustomSourceRules();
68  
69    /**
70     * Check if custom target segmentation rules should be used.
71     */
72    protected abstract boolean isUseCustomTargetRules();
73  
74    /**
75     * Get the path to custom source segmentation rules.
76     */
77    protected abstract String getCustomSourceRulesPath();
78  
79    /**
80     * Get the path to custom target segmentation rules.
81     */
82    protected abstract String getCustomTargetRulesPath();
83  
84    /**
85     * Check if whitespace should be collapsed.
86     */
87    protected abstract boolean isCollapseWhitespace();
88  
89    /**
90     * Perform the actual alignment logic.
91     * Called after all source and target TUs have been collected.
92     */
93    protected abstract void performAlignment(List<ITextUnit> sourceTUs, List<ITextUnit> targetTUs);
94  
95    @Override
96    protected Event handleStartBatch(Event event) {
97      boolean loadDefault = true;
98      SRXDocument srxDocument = new SRXDocument();
99  
100     // Prepare source segmentation if needed
101     if (isSegmentSource()) {
102       if (isUseCustomSourceRules()) {
103         try {
104           srxDocument.loadRules(getCustomSourceRulesPath());
105           loadDefault = false;
106 
107         } catch (Exception e) {
108           LOGGER.warn(
109               "Custom source segmentation rules file '{}' cannot be read. Using the default rules instead.",
110               getCustomSourceRulesPath());
111         }
112       }
113 
114       if (loadDefault) {
115         srxDocument.loadRules(SRXDocument.DEFAULT_SRX_RULES);
116       }
117 
118       sourceSegmenter = srxDocument.compileLanguageRules(getSourceLocale(), null);
119     }
120 
121     // Prepare target segmentation if needed
122     if (isSegmentTarget()) {
123       loadDefault = true;
124 
125       if (isUseCustomTargetRules()) {
126         try {
127           srxDocument.loadRules(getCustomTargetRulesPath());
128           loadDefault = false;
129 
130         } catch (Exception e) {
131           LOGGER.warn(
132               "Custom target segmentation rules file '{}' cannot be read. Using the default rules instead.",
133               getCustomTargetRulesPath());
134         }
135       }
136 
137       if (loadDefault) {
138         srxDocument.loadRules(SRXDocument.DEFAULT_SRX_RULES);
139       }
140 
141       targetSegmenter = srxDocument.compileLanguageRules(getTargetLocale(), null);
142     }
143 
144     return event;
145   }
146 
147   @Override
148   protected Event handleStartDocument(Event event) {
149     sourceTUs.clear();
150     targetTUs.clear();
151 
152     if (targetInput != null) {
153       initializeTargetFilter();
154     }
155 
156     return super.handleStartDocument(event);
157   }
158 
159   @Override
160   protected Event handleTextUnit(Event event) {
161     ITextUnit tu = event.getTextUnit();
162 
163     if (!tu.isTranslatable() || tu.isEmpty()) {
164       return Event.createNoopEvent();
165     }
166 
167     // Collapse whitespace if needed *before* segmentation
168     if (isCollapseWhitespace()) {
169       for (TextPart p : tu.getSource().getSegments()) {
170         p.text.setCodedText(StringUtil.collapseWhitespace(p.text.getCodedText()));
171       }
172     }
173 
174     // Segment the source if requested
175     if (isSegmentSource()) {
176       tu.createSourceSegmentation(sourceSegmenter);
177     }
178 
179     sourceTUs.add(tu);
180     return Event.createNoopEvent();
181   }
182 
183   /**
184    * Initialize the target filter with the target input document.
185    */
186   protected void initializeTargetFilter() {
187     if (targetFilter != null && targetInput != null) {
188       targetFilter.open(targetInput);
189     }
190   }
191 
192   /**
193    * Read all target text units from the target document.
194    * This is called automatically by processTuEvents.
195    */
196   protected void readAllTargetTUs() {
197     if (targetFilter == null) {
198       return;
199     }
200 
201     while (targetFilter.hasNext()) {
202       Event event = targetFilter.next();
203 
204       if (event.isTextUnit()) {
205         ITextUnit tu = event.getTextUnit();
206 
207         if (tu.isTranslatable() && !tu.isEmpty()) {
208           // Collapse whitespace if needed *before* segmentation
209           if (isCollapseWhitespace()) {
210             for (TextPart p : tu.getSource().getSegments()) {
211               p.text.setCodedText(StringUtil.collapseWhitespace(p.text.getCodedText()));
212             }
213           }
214 
215           // Segment the target if requested
216           if (isSegmentTarget()) {
217             tu.createSourceSegmentation(targetSegmenter);
218           }
219 
220           targetTUs.add(tu);
221         }
222       }
223     }
224   }
225 
226   @Override
227   protected void clear() {
228     sourceTUs.clear();
229     targetTUs.clear();
230 
231     if (targetFilter != null) {
232       targetFilter.close();
233     }
234   }
235 
236   @Override
237   protected void processTuEvents(List<Event> tuEvents) {
238     // Read all target TUs
239     readAllTargetTUs();
240 
241     // Perform the alignment (implemented by subclasses)
242     performAlignment(sourceTUs, targetTUs);
243   }
244 }