1 package net.sf.okapi.steps.heuristicaligner;
2
3 import java.util.ArrayList;
4 import java.util.List;
5
6 import com.acumenvelocity.ath.common.OkapiUtil;
7 import com.acumenvelocity.ath.steps.BaseAlignerStep;
8
9 import net.sf.okapi.common.IParameters;
10 import net.sf.okapi.common.IResource;
11 import net.sf.okapi.common.UsingParameters;
12 import net.sf.okapi.common.filters.IFilter;
13 import net.sf.okapi.common.resource.AlignmentStatus;
14 import net.sf.okapi.common.resource.ITextUnit;
15 import net.sf.okapi.common.resource.Segment;
16 import net.sf.okapi.common.resource.TextContainer;
17 import net.sf.okapi.common.resource.TextFragment;
18 import net.sf.okapi.common.resource.TextPart;
19
20
21
22
23
24 @UsingParameters(HeuristicSentenceAlignerParameters.class)
25 public class HeuristicSentenceAlignerStep extends BaseAlignerStep {
26
27 private HeuristicSentenceAlignerParameters params;
28 private HeuristicAligner aligner;
29
30 public HeuristicSentenceAlignerStep(IFilter targetFilter) {
31 super(targetFilter);
32 params = new HeuristicSentenceAlignerParameters();
33 aligner = new HeuristicAligner();
34 }
35
36 @Override
37 public String getName() {
38 return "Improved Heuristic Sentence Alignment";
39 }
40
41 @Override
42 public String getDescription() {
43 return "Aligns paragraphs (M:N) and sentences using DP + translation-aware re-segmentation. "
44 + "Uses sophisticated heuristics with back-translation for both levels.";
45 }
46
47 @Override
48 public HeuristicSentenceAlignerParameters getParameters() {
49 return params;
50 }
51
52 @Override
53 public void setParameters(IParameters params) {
54 this.params = (HeuristicSentenceAlignerParameters) params;
55 }
56
57 @Override
58 protected boolean isSegmentSource() {
59 return params.isSegmentSource();
60 }
61
62 @Override
63 protected boolean isSegmentTarget() {
64 return params.isSegmentTarget();
65 }
66
67 @Override
68 protected boolean isUseCustomSourceRules() {
69 return params.isUseCustomSourceRules();
70 }
71
72 @Override
73 protected boolean isUseCustomTargetRules() {
74 return params.isUseCustomTargetRules();
75 }
76
77 @Override
78 protected String getCustomSourceRulesPath() {
79 return params.getCustomSourceRulesPath();
80 }
81
82 @Override
83 protected String getCustomTargetRulesPath() {
84 return params.getCustomTargetRulesPath();
85 }
86
87 @Override
88 protected boolean isCollapseWhitespace() {
89 return params.isCollapseWhitespace();
90 }
91
92 @Override
93 protected void performAlignment(List<ITextUnit> sourceTUs, List<ITextUnit> targetTUs) {
94 filterMtArtifacts(sourceTUs, targetTUs);
95
96 LOGGER.info("Alignment: {} source, {} target paragraphs", sourceTUs.size(), targetTUs.size());
97
98 aligner.setTranslationAwareResegmentation(true);
99
100 if (params.isForceSimpleOneToOneAlignment() && sourceTUs.size() == targetTUs.size()
101 && sourceTUs.size() > 0) {
102 LOGGER.info("Forcing 1:1 alignment");
103 performSimpleOneToOneAlignment(sourceTUs, targetTUs);
104
105 } else {
106
107 performManyToManyAlignment(sourceTUs, targetTUs);
108 }
109
110
111 for (ITextUnit tu : sourceTUs) {
112 OkapiUtil.setAlOrigin(tu, getSourceLocale(), getTargetLocale());
113 }
114 }
115
116
117
118
119 private void performSimpleOneToOneAlignment(List<ITextUnit> sourceTUs,
120 List<ITextUnit> targetTUs) {
121
122 for (int i = 0; i < sourceTUs.size(); i++) {
123 ITextUnit srcTu = sourceTUs.get(i);
124 ITextUnit trgTu = targetTUs.get(i);
125
126 alignSentencesInParagraphPair(srcTu, trgTu);
127
128 LOGGER.debug("1:1 aligned paragraph pair {}", i);
129 }
130 }
131
132
133
134
135
136
137 private void performManyToManyAlignment(List<ITextUnit> sourceTUs, List<ITextUnit> targetTUs) {
138 int m = sourceTUs.size();
139 int n = targetTUs.size();
140
141
142 double[][] simMatrix = new double[m][n];
143
144 for (int i = 0; i < m; i++) {
145 String srcText = getPlainText(sourceTUs.get(i));
146
147 for (int j = 0; j < n; j++) {
148 String trgText = getPlainText(targetTUs.get(j));
149 simMatrix[i][j] = aligner.calculateParagraphSimilarity(
150 srcText.toLowerCase().trim(),
151 trgText.toLowerCase().trim(),
152 getSourceLocale(),
153 getTargetLocale());
154 }
155 }
156
157
158 List<ParagraphMatch> matches = findParagraphMatches(simMatrix);
159
160
161 int totalConsumed = 0;
162 for (ParagraphMatch match : matches) {
163 List<ITextUnit> consumed = applyParagraphMatch(match, sourceTUs, targetTUs);
164 totalConsumed += consumed.size();
165 }
166
167 if (totalConsumed > 0) {
168 LOGGER.info("Marked {} source TUs as non-translatable (merged into other TUs)",
169 totalConsumed);
170 }
171 }
172
173
174
175
176
177 private List<ParagraphMatch> findParagraphMatches(double[][] simMatrix) {
178 List<ParagraphMatch> matches = new ArrayList<>();
179 int m = simMatrix.length;
180 int n = simMatrix[0].length;
181
182 boolean[] srcUsed = new boolean[m];
183 boolean[] trgUsed = new boolean[n];
184
185 int srcPtr = 0;
186 int trgPtr = 0;
187
188 while (srcPtr < m) {
189
190 ParagraphMatch bestMatch = null;
191 double bestScore = -1.0;
192
193 for (int srcWindow = 1; srcWindow <= Math.min(6, m - srcPtr); srcWindow++) {
194
195 boolean srcConflict = false;
196 for (int s = srcPtr; s < srcPtr + srcWindow; s++) {
197 if (srcUsed[s]) {
198 srcConflict = true;
199 break;
200 }
201 }
202 if (srcConflict)
203 break;
204
205
206 for (int trgWindow = 0; trgWindow <= Math.min(6, n - trgPtr); trgWindow++) {
207
208 boolean trgConflict = false;
209 for (int t = trgPtr; t < trgPtr + trgWindow; t++) {
210 if (trgUsed[t]) {
211 trgConflict = true;
212 break;
213 }
214 }
215 if (trgConflict)
216 break;
217
218
219 double score = calculateGroupSimilarity(
220 simMatrix, srcPtr, srcWindow, trgPtr, trgWindow);
221
222
223 double sizePenalty = Math.abs(srcWindow - trgWindow) * 0.05;
224 double adjustedScore = score - sizePenalty;
225
226 if (adjustedScore > bestScore) {
227 bestScore = score;
228 bestMatch = new ParagraphMatch(srcPtr, srcWindow, trgPtr, trgWindow, score);
229 }
230 }
231 }
232
233 if (bestMatch != null && (bestMatch.targetCount > 0
234 ? bestScore >= HeuristicAligner.PARAGRAPH_MATCH_THRESHOLD
235 : bestMatch.sourceCount <= 3)) {
236
237
238 for (int s = bestMatch.sourceStart; s < bestMatch.sourceStart
239 + bestMatch.sourceCount; s++) {
240 srcUsed[s] = true;
241 }
242 for (int t = bestMatch.targetStart; t < bestMatch.targetStart
243 + bestMatch.targetCount; t++) {
244 trgUsed[t] = true;
245 }
246
247 matches.add(bestMatch);
248 srcPtr = bestMatch.sourceStart + bestMatch.sourceCount;
249 trgPtr = bestMatch.targetStart + bestMatch.targetCount;
250
251 LOGGER.info("Paragraph match: s[{}..{}] -> t[{}..{}] (score: {:.3f})",
252 bestMatch.sourceStart, bestMatch.sourceStart + bestMatch.sourceCount - 1,
253 bestMatch.targetStart, bestMatch.targetStart + bestMatch.targetCount - 1,
254 bestScore);
255 } else {
256
257 srcPtr++;
258 }
259 }
260
261 return matches;
262 }
263
264
265
266
267 private double calculateGroupSimilarity(double[][] simMatrix,
268 int srcStart, int srcCount, int trgStart, int trgCount) {
269
270 if (srcCount == 0 || trgCount == 0) {
271 return 0.0;
272 }
273
274 double totalScore = 0.0;
275 int count = 0;
276
277 for (int i = srcStart; i < srcStart + srcCount; i++) {
278 for (int j = trgStart; j < trgStart + trgCount; j++) {
279 totalScore += simMatrix[i][j];
280 count++;
281 }
282 }
283
284 return count > 0 ? totalScore / count : 0.0;
285 }
286
287
288
289
290
291 private List<ITextUnit> applyParagraphMatch(ParagraphMatch match, List<ITextUnit> sourceTUs,
292 List<ITextUnit> targetTUs) {
293
294 List<ITextUnit> consumedTUs = new ArrayList<>();
295
296
297 List<ITextUnit> srcGroup = new ArrayList<>();
298 for (int i = match.sourceStart; i < match.sourceStart + match.sourceCount; i++) {
299 srcGroup.add(sourceTUs.get(i));
300 }
301
302
303 List<ITextUnit> trgGroup = new ArrayList<>();
304 for (int i = match.targetStart; i < match.targetStart + match.targetCount; i++) {
305 trgGroup.add(targetTUs.get(i));
306 }
307
308 if (srcGroup.isEmpty()) {
309 return consumedTUs;
310 }
311
312
313 if (srcGroup.size() == trgGroup.size()) {
314 for (int i = 0; i < srcGroup.size(); i++) {
315 ITextUnit srcTu = srcGroup.get(i);
316 ITextUnit trgTu = trgGroup.get(i);
317
318 alignSentencesInParagraphPair(srcTu, trgTu);
319 }
320
321 return consumedTUs;
322 }
323
324
325 ITextUnit primarySrc = srcGroup.get(0);
326
327
328 for (int i = 1; i < srcGroup.size(); i++) {
329 ITextUnit additionalSrc = srcGroup.get(i);
330
331 LOGGER.debug("Merging source TU {} into {}",
332 additionalSrc.getId(), primarySrc.getId());
333
334
335 for (Segment seg : additionalSrc.getSource().getSegments()) {
336 primarySrc.getSource().append(seg.clone());
337 }
338
339
340 additionalSrc.setIsTranslatable(false);
341 additionalSrc.getSource().clear();
342
343 consumedTUs.add(additionalSrc);
344 }
345
346 if (trgGroup.isEmpty()) {
347
348 LOGGER.debug("Creating empty target for source group starting at {}", match.sourceStart);
349 createEmptyTargetSegments(primarySrc);
350
351 } else {
352
353 ITextUnit mergedTarget = trgGroup.get(0).clone();
354
355 for (int i = 1; i < trgGroup.size(); i++) {
356 ITextUnit additionalTrg = trgGroup.get(i);
357
358 for (Segment seg : additionalTrg.getSource().getSegments()) {
359 mergedTarget.getSource().append(seg.clone());
360 }
361 }
362
363
364 alignSentencesInParagraphPair(primarySrc, mergedTarget);
365 }
366
367 return consumedTUs;
368 }
369
370
371
372
373 private void alignSentencesInParagraphPair(ITextUnit sourceTu, ITextUnit targetTu) {
374
375 List<Segment> sourceSegments = new ArrayList<>(sourceTu.getSource().getSegments().asList());
376 List<Segment> targetSegments = new ArrayList<>(targetTu.getSource().getSegments().asList());
377
378 if (sourceSegments.isEmpty()) {
379 LOGGER.warn("Source has no segments in paragraph pair");
380 return;
381 }
382
383 if (targetSegments.isEmpty()) {
384 LOGGER.warn("Target has no segments in paragraph pair");
385 createEmptyTargetSegments(sourceTu);
386 return;
387 }
388
389
390 TextContainer tempTarget = new TextContainer();
391
392 for (Segment seg : targetSegments) {
393 tempTarget.getSegments().append(seg.clone());
394 }
395
396 sourceTu.setTarget(getTargetLocale(), tempTarget);
397
398
399 List<SentenceMatch> matches = aligner.alignSentencesInTu(
400 sourceTu, getSourceLocale(), getTargetLocale());
401
402
403 TextContainer finalTarget = sourceTu.createTarget(getTargetLocale(), false,
404 IResource.CREATE_EMPTY);
405
406 finalTarget.clear();
407
408
409 for (SentenceMatch m : matches) {
410 if (m.sourceIndex >= 0 && m.sourceIndex < sourceSegments.size()) {
411 Segment srcSeg = sourceSegments.get(m.sourceIndex);
412
413 if (m.targetIndex >= 0 && m.targetIndex < targetSegments.size()) {
414 Segment trgSeg = targetSegments.get(m.targetIndex);
415 finalTarget.append(new Segment(srcSeg.getId(), trgSeg.text.clone()));
416
417 } else {
418
419 if (sourceSegments.size() == 1 || targetSegments.size() == 1) {
420
421 finalTarget = targetTu.getSource();
422 sourceTu.setTarget(getTargetLocale(), finalTarget);
423
424 if (targetSegments.size() == 1) {
425 sourceTu.getSource().joinAll();
426 }
427
428 if (sourceSegments.size() == 1) {
429 finalTarget.joinAll();
430 }
431
432 break;
433
434 } else {
435
436 finalTarget.append(new Segment(srcSeg.getId(), new TextFragment("???")));
437 }
438 }
439 }
440 }
441
442 finalTarget.setHasBeenSegmentedFlag(true);
443 finalTarget.getSegments().setAlignmentStatus(AlignmentStatus.ALIGNED);
444 }
445
446
447
448
449 private void createEmptyTargetSegments(ITextUnit tu) {
450 tu.createTarget(getTargetLocale(), false, IResource.CREATE_EMPTY);
451 for (Segment s : tu.getSource().getSegments()) {
452 tu.getTarget(getTargetLocale()).append(new Segment(s.getId(), new TextFragment("")));
453 }
454 }
455
456
457
458
459
460
461 private void filterMtArtifacts(List<ITextUnit> sourceTUs, List<ITextUnit> targetTUs) {
462
463 List<ITextUnit> filtered = new ArrayList<>();
464 for (ITextUnit tu : targetTUs) {
465 String text = tu.getSource().toString().toLowerCase().trim();
466
467
468 if (text.matches(".*\\bmachine\\b.*translated.*|.*google.*|.*auto.*translated.*")) {
469 LOGGER.info("Filtered MT artifact: {}", text);
470 continue;
471 }
472
473
474 filtered.add(tu);
475 }
476
477 int removedCount = targetTUs.size() - filtered.size();
478
479 if (removedCount > 0) {
480 LOGGER.info("Filtered {} MT artifact paragraphs from target", removedCount);
481 }
482
483 targetTUs.clear();
484 targetTUs.addAll(filtered);
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524 }
525
526
527
528
529 private String getPlainText(ITextUnit tu) {
530 StringBuilder sb = new StringBuilder();
531 for (TextPart tp : tu.getSource().getParts()) {
532 if (sb.length() > 0) {
533 sb.append(" ");
534 }
535 sb.append(tp.getContent().getText());
536 }
537 return sb.toString().trim();
538 }
539
540
541
542
543 static class ParagraphMatch {
544 int sourceStart;
545 int sourceCount;
546 int targetStart;
547 int targetCount;
548 double score;
549
550 ParagraphMatch(int ss, int sc, int ts, int tc, double score) {
551 this.sourceStart = ss;
552 this.sourceCount = sc;
553 this.targetStart = ts;
554 this.targetCount = tc;
555 this.score = score;
556 }
557 }
558
559
560
561
562 static class SentenceMatch {
563 public int sourceIndex;
564 public int targetIndex = -1;
565 public double score;
566
567 public SentenceMatch(int s, int t, double sc) {
568 this.sourceIndex = s;
569 this.targetIndex = t;
570 this.score = sc;
571 }
572 }
573 }