1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 package net.sf.okapi.steps.sentencealigner;
20
21 import java.util.Iterator;
22 import java.util.LinkedList;
23 import java.util.List;
24
25 import org.slf4j.Logger;
26 import org.slf4j.LoggerFactory;
27
28 import com.acumenvelocity.ath.common.OkapiUtil;
29
30 import net.sf.okapi.common.Event;
31 import net.sf.okapi.common.EventType;
32 import net.sf.okapi.common.IParameters;
33 import net.sf.okapi.common.ISegmenter;
34 import net.sf.okapi.common.StringUtil;
35 import net.sf.okapi.common.UsingParameters;
36 import net.sf.okapi.common.exceptions.OkapiException;
37 import net.sf.okapi.common.filters.IFilter;
38 import net.sf.okapi.common.filterwriter.TMXWriter;
39 import net.sf.okapi.common.observer.IObservable;
40 import net.sf.okapi.common.observer.IObserver;
41 import net.sf.okapi.common.pipeline.BasePipelineStep;
42 import net.sf.okapi.common.pipeline.IPipelineStep;
43 import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
44 import net.sf.okapi.common.pipeline.annotations.StepParameterType;
45 import net.sf.okapi.common.resource.AlignedPair;
46 import net.sf.okapi.common.resource.CodeMatchStrategy;
47 import net.sf.okapi.common.resource.IAlignedSegments;
48 import net.sf.okapi.common.resource.ITextUnit;
49 import net.sf.okapi.common.resource.PipelineParameters;
50 import net.sf.okapi.common.resource.RawDocument;
51 import net.sf.okapi.common.resource.Segment;
52 import net.sf.okapi.common.resource.TextFragmentUtil;
53 import net.sf.okapi.common.resource.TextPart;
54 import net.sf.okapi.common.resource.TextUnit;
55 import net.sf.okapi.common.resource.TextUnitUtil;
56 import net.sf.okapi.lib.segmentation.SRXDocument;
57 import net.sf.okapi.steps.gcaligner.AlignmentScorer;
58 import net.sf.okapi.steps.gcaligner.GaleAndChurch;
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74 @UsingParameters(Parameters.class)
75 public class SentenceAlignerStep extends BasePipelineStep implements IObserver {
76 private final Logger LOGGER = LoggerFactory.getLogger(getClass());
77
78 private Parameters params;
79 private TMXWriter tmx;
80 private IFilter targetFilter;
81 private RawDocument targetInput = null;
82 private SentenceAligner sentenceAligner;
83 private ISegmenter sourceSegmenter;
84 private ISegmenter targetSegmenter;
85
86 public SentenceAlignerStep(IFilter targetFilter) {
87 super();
88
89 this.targetFilter = targetFilter;
90 params = new Parameters();
91 List<AlignmentScorer<Segment>> scorerList = new LinkedList<>();
92 scorerList.add(new GaleAndChurch<>());
93 sentenceAligner = new SentenceAligner(scorerList);
94 }
95
96 @StepParameterMapping(parameterType = StepParameterType.SECOND_INPUT_RAWDOC)
97 public void setSecondInput(RawDocument secondInput) {
98 this.targetInput = secondInput;
99 }
100
101 @Override
102 public String getName() {
103 return "Sentence Alignment";
104 }
105
106 @Override
107 public String getDescription() {
108 return "Aligns sentences within text units (paragraphs). Produces sentence alignments as bilingual text units or a TMX file.";
109 }
110
111 @Override
112 public Parameters getParameters() {
113 return params;
114 }
115
116 @Override
117 public void setParameters(IParameters params) {
118 this.params = (Parameters) params;
119 }
120
121 @Override
122 protected Event handleStartBatch(Event event) {
123 boolean loadDefault = true;
124 SRXDocument srxDocument = new SRXDocument();
125
126
127 if (params.isSegmentSource()) {
128
129 if (params.isUseCustomSourceRules()) {
130 try {
131 srxDocument.loadRules(params.getCustomSourceRulesPath());
132 loadDefault = false;
133
134 } catch (Exception e) {
135 LOGGER.warn(
136 "Custom source segmentation rules file '{}' cannot be read.\nUsing the default rules instead.",
137 params.getCustomSourceRulesPath());
138 }
139 }
140
141 if (loadDefault) {
142 srxDocument.loadRules(SRXDocument.DEFAULT_SRX_RULES);
143 }
144
145
146
147 sourceSegmenter = srxDocument.compileLanguageRules(getSourceLocale(), null);
148 }
149
150
151 if (params.isSegmentTarget()) {
152 loadDefault = true;
153
154
155 if (params.isUseCustomTargetRules()) {
156 try {
157 srxDocument.loadRules(params.getCustomTargetRulesPath());
158 loadDefault = false;
159
160 } catch (Exception e) {
161 LOGGER.warn(
162 "Custom target segmentation rules file '{}' cannot be read.\nUsing the default rules instead.",
163 params.getCustomTargetRulesPath());
164 }
165 }
166
167 if (loadDefault) {
168 srxDocument.loadRules(SRXDocument.DEFAULT_SRX_RULES);
169 }
170
171
172
173 targetSegmenter = srxDocument.compileLanguageRules(getTargetLocale(), null);
174 }
175
176 return event;
177 }
178
179 protected Event handleEndBatch(Event event) {
180 if (tmx != null) {
181 tmx.writeEndDocument();
182 tmx.close();
183 tmx = null;
184 }
185
186 return event;
187 }
188
189 @Override
190 protected Event handleStartDocument(Event event) {
191 if (targetInput != null) {
192 initializeTargetFilter();
193 }
194
195
196 if (tmx == null && params.isGenerateTMX()) {
197 String mimeType = event.getStartDocument().getMimeType();
198 tmx = new TMXWriter(params.getTmxOutputPath());
199
200 tmx.writeStartDocument(getSourceLocale(), getTargetLocale(), getClass().getName(), null,
201 "sentence", null, mimeType);
202 }
203
204 return event;
205 }
206
207 @Override
208 protected Event handleEndDocument(Event event) {
209 if (targetFilter != null) {
210 targetFilter.close();
211 }
212
213 return event;
214 }
215
216 @Override
217 protected Event handleTextUnit(Event sourceEvent) {
218 ITextUnit sourceTu = sourceEvent.getTextUnit();
219 ITextUnit targetTu = null;
220
221
222 if (!sourceTu.isTranslatable() || sourceTu.isEmpty()) {
223 return sourceEvent;
224 }
225
226
227 if (targetInput != null) {
228 Event targetEvent = synchronize(EventType.TEXT_UNIT, sourceTu);
229 targetTu = targetEvent.getTextUnit();
230 }
231
232
233
234
235
236
237 if (params.isCollapseWhitespace()) {
238 for (TextPart p : sourceTu.getSource().getSegments()) {
239 p.text.setCodedText(StringUtil.collapseWhitespace(p.text.getCodedText()));
240 }
241
242 if (targetInput == null) {
243 for (TextPart p : sourceTu.getTarget(getTargetLocale()).getSegments()) {
244 p.text.setCodedText(StringUtil.collapseWhitespace(p.text.getCodedText()));
245 }
246
247 } else {
248 for (TextPart p : targetTu.getSource().getSegments()) {
249 p.text.setCodedText(StringUtil.collapseWhitespace(p.text.getCodedText()));
250 }
251 }
252 }
253
254
255 if (params.isSegmentSource()) {
256 sourceTu.createSourceSegmentation(sourceSegmenter);
257 }
258
259
260 if (params.isSegmentTarget()) {
261 if (targetTu == null) {
262
263 sourceTu.createTargetSegmentation(targetSegmenter, getTargetLocale());
264
265 } else {
266
267 targetTu.createSourceSegmentation(targetSegmenter);
268 }
269 }
270
271 ITextUnit alignedTextUnit;
272
273 if (params.isForceSimpleOneToOneAlignment()) {
274 alignedTextUnit = sourceTu;
275
276 if (targetInput == null) {
277
278 if (alignedTextUnit.getSourceSegments().count() != alignedTextUnit
279 .getTargetSegments(getTargetLocale()).count()) {
280
281 alignedTextUnit.getSource().joinAll();
282 alignedTextUnit.getTarget(getTargetLocale()).joinAll();
283 }
284
285 alignedTextUnit.getAlignedSegments().align(getTargetLocale());
286
287 } else {
288
289 if (alignedTextUnit.getSourceSegments().count() != targetTu.getSourceSegments().count()) {
290
291 alignedTextUnit.getSource().joinAll();
292 targetTu.getSource().joinAll();
293 }
294
295 List<AlignedPair> alignedPairs = new LinkedList<>();
296 Iterator<Segment> targetSegments = targetTu.getSourceSegments().iterator();
297
298 for (Segment sourceSegment : alignedTextUnit.getSourceSegments()) {
299 alignedPairs
300 .add(new AlignedPair(sourceSegment, targetSegments.next(), getTargetLocale()));
301 }
302
303 alignedTextUnit.getAlignedSegments().align(alignedPairs, getTargetLocale());
304 }
305
306 } else {
307 if (targetInput == null) {
308
309 alignedTextUnit = sentenceAligner.align(sourceTu, getSourceLocale(), getTargetLocale(),
310 params.isOutputOneTOneMatchesOnly());
311
312 } else {
313
314 alignedTextUnit = sentenceAligner.align(sourceTu, targetTu, getSourceLocale(),
315 getTargetLocale(),
316 params.isOutputOneTOneMatchesOnly());
317 }
318 }
319
320
321
322 TextUnitUtil.trimSegments(alignedTextUnit.getSource());
323 TextUnitUtil.trimSegments(alignedTextUnit.getTarget(getTargetLocale()));
324
325
326
327 IAlignedSegments segments = alignedTextUnit.getAlignedSegments();
328
329 for (Segment s : segments) {
330 Segment t = segments.getCorrespondingTarget(s, getTargetLocale());
331
332 if (t == null) {
333 continue;
334 }
335
336
337 TextFragmentUtil.alignAndCopyCodeMetadata(s.text, t.text, true, true,
338 CodeMatchStrategy.STRICT);
339 }
340
341 OkapiUtil.setAlOrigin(alignedTextUnit, getSourceLocale(), getTargetLocale());
342
343
344 if (params.isGenerateTMX()) {
345 tmx.writeTUFull(alignedTextUnit);
346 }
347
348
349 return new Event(EventType.TEXT_UNIT, alignedTextUnit);
350 }
351
352
353
354 @Override
355 protected Event handlePipelineParameters(Event sourceEvent) {
356
357 PipelineParameters pp = new PipelineParameters();
358 setSecondInput(pp.getSecondInputRawDocument());
359 return Event.createNoopEvent();
360 }
361
362
363
364
365
366
367
368
369
370
371
372
373
374 private void initializeTargetFilter() {
375 targetFilter.open(targetInput);
376 }
377
378 private Event synchronize(EventType untilType, ITextUnit sourceTu) {
379 boolean found = false;
380 Event event = null;
381
382 while (!found && targetFilter.hasNext()) {
383 event = targetFilter.next();
384
385 if (event.isTextUnit()) {
386 ITextUnit stu = event.getTextUnit();
387
388
389 if (!stu.isTranslatable() || stu.isEmpty()) {
390 continue;
391 }
392 }
393
394 found = (event.getEventType() == untilType);
395 }
396
397 if (!found) {
398 if (params.isGenerateTMX() && (tmx != null)) {
399 tmx.writeEndDocument();
400 tmx.close();
401 tmx = null;
402 }
403
404 String targetDoc = (targetInput == null) ? "null"
405 : targetInput.getInputURI() == null ? "null" : targetInput.getInputURI().toString();
406
407 throw new OkapiException(
408 "Different number of source or target TextUnits. " +
409 "The source and target documents are not paragraph aligned at:\n" +
410 "Source: " + sourceTu.getName() + " <> " + sourceTu.getSource().toString() +
411 "\nTarget Document: " + targetDoc);
412 }
413
414 return event;
415 }
416
417 @Override
418 public void update(IObservable o, Object event) {
419 }
420 }