View Javadoc
1   /*
2    * ===========================================================================
3    * Copyright (C) 2009-2025 by the Okapi Framework contributors
4    * -----------------------------------------------------------------------------
5    * Licensed under the Apache License, Version 2.0 (the "License");
6    * you may not use this file except in compliance with the License.
7    * You may obtain a copy of the License at
8    * 
9    * http://www.apache.org/licenses/LICENSE-2.0
10   * 
11   * Unless required by applicable law or agreed to in writing, software
12   * distributed under the License is distributed on an "AS IS" BASIS,
13   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14   * See the License for the specific language governing permissions and
15   * limitations under the License.
16   * ===========================================================================
17   */
18  
19  package net.sf.okapi.steps.sentencealigner;
20  
21  import java.util.Iterator;
22  import java.util.LinkedList;
23  import java.util.List;
24  
25  import org.slf4j.Logger;
26  import org.slf4j.LoggerFactory;
27  
28  import com.acumenvelocity.ath.common.OkapiUtil;
29  
30  import net.sf.okapi.common.Event;
31  import net.sf.okapi.common.EventType;
32  import net.sf.okapi.common.IParameters;
33  import net.sf.okapi.common.ISegmenter;
34  import net.sf.okapi.common.StringUtil;
35  import net.sf.okapi.common.UsingParameters;
36  import net.sf.okapi.common.exceptions.OkapiException;
37  import net.sf.okapi.common.filters.IFilter;
38  import net.sf.okapi.common.filterwriter.TMXWriter;
39  import net.sf.okapi.common.observer.IObservable;
40  import net.sf.okapi.common.observer.IObserver;
41  import net.sf.okapi.common.pipeline.BasePipelineStep;
42  import net.sf.okapi.common.pipeline.IPipelineStep;
43  import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
44  import net.sf.okapi.common.pipeline.annotations.StepParameterType;
45  import net.sf.okapi.common.resource.AlignedPair;
46  import net.sf.okapi.common.resource.CodeMatchStrategy;
47  import net.sf.okapi.common.resource.IAlignedSegments;
48  import net.sf.okapi.common.resource.ITextUnit;
49  import net.sf.okapi.common.resource.PipelineParameters;
50  import net.sf.okapi.common.resource.RawDocument;
51  import net.sf.okapi.common.resource.Segment;
52  import net.sf.okapi.common.resource.TextFragmentUtil;
53  import net.sf.okapi.common.resource.TextPart;
54  import net.sf.okapi.common.resource.TextUnit;
55  import net.sf.okapi.common.resource.TextUnitUtil;
56  import net.sf.okapi.lib.segmentation.SRXDocument;
57  import net.sf.okapi.steps.gcaligner.AlignmentScorer;
58  import net.sf.okapi.steps.gcaligner.GaleAndChurch;
59  
60  /**
61   * Align sentences between source and target paragraphs (TextUnits) and produce a TMX file with
62   * aligned sentences. This
63   * {@link IPipelineStep} (via configuration) can also output aligned (multilingual
64   * {@link TextUnit}s)
65   * 
66   * @author HARGRAVEJE
67   * @version 1.47.0
68   * 
69   *          SV: Removed unneeded deprecation
70   *          SV: Fixed NPE in
71   *          TextFragmentUtil.alignAndCopyCodeMetadata(s.text, t.text, true, true,
72   *          CodeMatchStrategy.STRICT);
73   */
74  @UsingParameters(Parameters.class)
75  public class SentenceAlignerStep extends BasePipelineStep implements IObserver {
76    private final Logger LOGGER = LoggerFactory.getLogger(getClass());
77  
78    private Parameters params;
79    private TMXWriter tmx;
80    private IFilter targetFilter;
81    private RawDocument targetInput = null;
82    private SentenceAligner sentenceAligner;
83    private ISegmenter sourceSegmenter;
84    private ISegmenter targetSegmenter;
85  
86    public SentenceAlignerStep(IFilter targetFilter) {
87      super();
88  
89      this.targetFilter = targetFilter;
90      params = new Parameters();
91      List<AlignmentScorer<Segment>> scorerList = new LinkedList<>();
92      scorerList.add(new GaleAndChurch<>());
93      sentenceAligner = new SentenceAligner(scorerList);
94    }
95  
96    @StepParameterMapping(parameterType = StepParameterType.SECOND_INPUT_RAWDOC)
97    public void setSecondInput(RawDocument secondInput) {
98      this.targetInput = secondInput;
99    }
100 
101   @Override
102   public String getName() {
103     return "Sentence Alignment";
104   }
105 
106   @Override
107   public String getDescription() {
108     return "Aligns sentences within text units (paragraphs). Produces sentence alignments as bilingual text units or a TMX file.";
109   }
110 
111   @Override
112   public Parameters getParameters() {
113     return params;
114   }
115 
116   @Override
117   public void setParameters(IParameters params) {
118     this.params = (Parameters) params;
119   }
120 
121   @Override
122   protected Event handleStartBatch(Event event) {
123     boolean loadDefault = true;
124     SRXDocument srxDocument = new SRXDocument();
125 
126     // Prepare source segmentation if needed
127     if (params.isSegmentSource()) {
128       // Load default or custom rules
129       if (params.isUseCustomSourceRules()) {
130         try {
131           srxDocument.loadRules(params.getCustomSourceRulesPath());
132           loadDefault = false;
133 
134         } catch (Exception e) {
135           LOGGER.warn(
136               "Custom source segmentation rules file '{}' cannot be read.\nUsing the default rules instead.",
137               params.getCustomSourceRulesPath());
138         }
139       }
140 
141       if (loadDefault) {
142         srxDocument.loadRules(SRXDocument.DEFAULT_SRX_RULES);
143       }
144 
145       // TODO: decide how we deal with leading/trailing spaces
146       // srxDocument.setTrimLeadingWhitespaces(false);
147       sourceSegmenter = srxDocument.compileLanguageRules(getSourceLocale(), null);
148     }
149 
150     // Prepare target segmentation if needed
151     if (params.isSegmentTarget()) {
152       loadDefault = true;
153 
154       // Load default or custom rules
155       if (params.isUseCustomTargetRules()) {
156         try {
157           srxDocument.loadRules(params.getCustomTargetRulesPath());
158           loadDefault = false;
159 
160         } catch (Exception e) {
161           LOGGER.warn(
162               "Custom target segmentation rules file '{}' cannot be read.\nUsing the default rules instead.",
163               params.getCustomTargetRulesPath());
164         }
165       }
166 
167       if (loadDefault) {
168         srxDocument.loadRules(SRXDocument.DEFAULT_SRX_RULES);
169       }
170 
171       // TODO: decide how we deal with leading/trailing spaces
172       // srxDocument.setTrimLeadingWhitespaces(false);
173       targetSegmenter = srxDocument.compileLanguageRules(getTargetLocale(), null);
174     }
175 
176     return event;
177   }
178 
179   protected Event handleEndBatch(Event event) {
180     if (tmx != null) {
181       tmx.writeEndDocument();
182       tmx.close();
183       tmx = null;
184     }
185 
186     return event;
187   }
188 
189   @Override
190   protected Event handleStartDocument(Event event) {
191     if (targetInput != null) {
192       initializeTargetFilter();
193     }
194 
195     // Start TMX writer (one for all input documents)
196     if (tmx == null && params.isGenerateTMX()) {
197       String mimeType = event.getStartDocument().getMimeType();
198       tmx = new TMXWriter(params.getTmxOutputPath());
199 
200       tmx.writeStartDocument(getSourceLocale(), getTargetLocale(), getClass().getName(), null,
201           "sentence", null, mimeType);
202     }
203 
204     return event;
205   }
206 
207   @Override
208   protected Event handleEndDocument(Event event) {
209     if (targetFilter != null) {
210       targetFilter.close();
211     }
212 
213     return event;
214   }
215 
216   @Override
217   protected Event handleTextUnit(Event sourceEvent) {
218     ITextUnit sourceTu = sourceEvent.getTextUnit();
219     ITextUnit targetTu = null;
220 
221     // Skip non-translatable and empty
222     if (!sourceTu.isTranslatable() || sourceTu.isEmpty()) {
223       return sourceEvent;
224     }
225 
226     // Move to the next target TU
227     if (targetInput != null) {
228       Event targetEvent = synchronize(EventType.TEXT_UNIT, sourceTu);
229       targetTu = targetEvent.getTextUnit();
230     }
231 
232     // TextUnitUtil.removeCodes(sourceTu.getSource()); // @@@
233     // TextUnitUtil.removeCodes(targetTu.getSource()); // @@@
234 
235     // collapse whitespace if needed *before* segmentation and alignment
236     // FIXME: When we get parallel pipelines we should move this to a step!!!
237     if (params.isCollapseWhitespace()) {
238       for (TextPart p : sourceTu.getSource().getSegments()) {
239         p.text.setCodedText(StringUtil.collapseWhitespace(p.text.getCodedText()));
240       }
241 
242       if (targetInput == null) {
243         for (TextPart p : sourceTu.getTarget(getTargetLocale()).getSegments()) {
244           p.text.setCodedText(StringUtil.collapseWhitespace(p.text.getCodedText()));
245         }
246 
247       } else {
248         for (TextPart p : targetTu.getSource().getSegments()) {
249           p.text.setCodedText(StringUtil.collapseWhitespace(p.text.getCodedText()));
250         }
251       }
252     }
253 
254     // Segment the source if requested
255     if (params.isSegmentSource()) {
256       sourceTu.createSourceSegmentation(sourceSegmenter);
257     }
258 
259     // Segment the target if requested
260     if (params.isSegmentTarget()) {
261       if (targetTu == null) {
262         // TextUnit is bilingual
263         sourceTu.createTargetSegmentation(targetSegmenter, getTargetLocale());
264 
265       } else {
266         // Separate target TextUnit
267         targetTu.createSourceSegmentation(targetSegmenter);
268       }
269     }
270 
271     ITextUnit alignedTextUnit;
272 
273     if (params.isForceSimpleOneToOneAlignment()) {
274       alignedTextUnit = sourceTu;
275 
276       if (targetInput == null) {
277         // bilingual case
278         if (alignedTextUnit.getSourceSegments().count() != alignedTextUnit
279             .getTargetSegments(getTargetLocale()).count()) {
280           // collapse sentences
281           alignedTextUnit.getSource().joinAll();
282           alignedTextUnit.getTarget(getTargetLocale()).joinAll();
283         }
284 
285         alignedTextUnit.getAlignedSegments().align(getTargetLocale());
286 
287       } else {
288         // monolingual case where we have separate source and target TextUnits
289         if (alignedTextUnit.getSourceSegments().count() != targetTu.getSourceSegments().count()) {
290           // collapse sentences
291           alignedTextUnit.getSource().joinAll();
292           targetTu.getSource().joinAll();
293         }
294 
295         List<AlignedPair> alignedPairs = new LinkedList<>();
296         Iterator<Segment> targetSegments = targetTu.getSourceSegments().iterator();
297 
298         for (Segment sourceSegment : alignedTextUnit.getSourceSegments()) {
299           alignedPairs
300               .add(new AlignedPair(sourceSegment, targetSegments.next(), getTargetLocale()));
301         }
302 
303         alignedTextUnit.getAlignedSegments().align(alignedPairs, getTargetLocale());
304       }
305 
306     } else {
307       if (targetInput == null) {
308         // case where the TextUnit is already bilingual
309         alignedTextUnit = sentenceAligner.align(sourceTu, getSourceLocale(), getTargetLocale(),
310             params.isOutputOneTOneMatchesOnly());
311 
312       } else {
313         // case where we have separate source and target TextUnits
314         alignedTextUnit = sentenceAligner.align(sourceTu, targetTu, getSourceLocale(),
315             getTargetLocale(),
316             params.isOutputOneTOneMatchesOnly());
317       }
318     }
319 
320     // remove leading and trailing whitespace in the aligned TextUnit
321     // for both source and target
322     TextUnitUtil.trimSegments(alignedTextUnit.getSource());
323     TextUnitUtil.trimSegments(alignedTextUnit.getTarget(getTargetLocale()));
324 
325     // align codes between source and target and
326     // copy source code data to corresponding target codes
327     IAlignedSegments segments = alignedTextUnit.getAlignedSegments();
328 
329     for (Segment s : segments) {
330       Segment t = segments.getCorrespondingTarget(s, getTargetLocale());
331 
332       if (t == null) {
333         continue;
334       }
335 
336       // Copy codes from source so that leveraged target matches the source
337       TextFragmentUtil.alignAndCopyCodeMetadata(s.text, t.text, true, true,
338           CodeMatchStrategy.STRICT);
339     }
340 
341     OkapiUtil.setAlOrigin(alignedTextUnit, getSourceLocale(), getTargetLocale());
342 
343     // Send the aligned TU to the TMX file
344     if (params.isGenerateTMX()) {
345       tmx.writeTUFull(alignedTextUnit);
346     }
347 
348     // pass on the aligned (possibly partially aligned)
349     return new Event(EventType.TEXT_UNIT, alignedTextUnit);
350   }
351 
352   // this allows another step (such as the ParagraphAlignmentStep) to consume the target input to
353   // add target data to the source Text Unit
354   @Override
355   protected Event handlePipelineParameters(Event sourceEvent) {
356     // FIXME: Why not simply setSecondInput to Null?
357     PipelineParameters pp = new PipelineParameters();
358     setSecondInput(pp.getSecondInputRawDocument());
359     return Event.createNoopEvent();
360   }
361 
362   // private void initializeFilter() {
363   // // Initialize the filter to read the translation to compare
364   // filter = fcMapper.createFilter(targetInput.getFilterConfigId(), null);
365   //
366   // // Open the second input for this batch item
367   // filter.open(targetInput);
368   //
369   // if (writer != null) {
370   // writer.close();
371   // }
372   // }
373 
374   private void initializeTargetFilter() {
375     targetFilter.open(targetInput);
376   }
377 
378   private Event synchronize(EventType untilType, ITextUnit sourceTu) {
379     boolean found = false;
380     Event event = null;
381 
382     while (!found && targetFilter.hasNext()) {
383       event = targetFilter.next();
384 
385       if (event.isTextUnit()) {
386         ITextUnit stu = event.getTextUnit();
387 
388         // Skip non-translatable and empty just like our primary filter
389         if (!stu.isTranslatable() || stu.isEmpty()) {
390           continue;
391         }
392       }
393 
394       found = (event.getEventType() == untilType);
395     }
396 
397     if (!found) {
398       if (params.isGenerateTMX() && (tmx != null)) {
399         tmx.writeEndDocument();
400         tmx.close();
401         tmx = null;
402       }
403 
404       String targetDoc = (targetInput == null) ? "null"
405           : targetInput.getInputURI() == null ? "null" : targetInput.getInputURI().toString();
406 
407       throw new OkapiException(
408           "Different number of source or target TextUnits. " +
409               "The source and target documents are not paragraph aligned at:\n" +
410               "Source: " + sourceTu.getName() + " <> " + sourceTu.getSource().toString() +
411               "\nTarget Document: " + targetDoc);
412     }
413 
414     return event;
415   }
416 
417   @Override
418   public void update(IObservable o, Object event) {
419   }
420 }