package net.sf.okapi.steps.heuristicaligner;

import java.util.ArrayList;
import java.util.List;

import com.acumenvelocity.ath.common.OkapiUtil;
import com.acumenvelocity.ath.steps.BaseAlignerStep;

import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.IResource;
import net.sf.okapi.common.UsingParameters;
import net.sf.okapi.common.filters.IFilter;
import net.sf.okapi.common.resource.AlignmentStatus;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.common.resource.TextPart;

/**
 * Heuristic sentence aligner.
 * Supports M:N paragraph matching and translation-aware sentence re-segmentation.
 */
@UsingParameters(HeuristicSentenceAlignerParameters.class)
public class HeuristicSentenceAlignerStep extends BaseAlignerStep {
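
  /*
   * Minimal usage sketch (not part of the original source; the pipeline wiring and the
   * setForceSimpleOneToOneAlignment() setter name are assumptions inferred from the getters
   * used in this class):
   *
   *   IFilter targetFilter = ...; // filter opened on the target document
   *   HeuristicSentenceAlignerStep step = new HeuristicSentenceAlignerStep(targetFilter);
   *   step.getParameters().setForceSimpleOneToOneAlignment(false); // assumed setter name
   *   // add the step to the hosting Okapi pipeline, which feeds it the source events
   */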

  private HeuristicSentenceAlignerParameters params;
  private HeuristicAligner aligner;

  public HeuristicSentenceAlignerStep(IFilter targetFilter) {
    super(targetFilter);
    params = new HeuristicSentenceAlignerParameters();
    aligner = new HeuristicAligner();
  }

  @Override
  public String getName() {
    return "Improved Heuristic Sentence Alignment";
  }

  @Override
  public String getDescription() {
    return "Aligns paragraphs (M:N) and sentences using DP + translation-aware re-segmentation. "
        + "Uses sophisticated heuristics with back-translation for both levels.";
  }

  @Override
  public HeuristicSentenceAlignerParameters getParameters() {
    return params;
  }

  @Override
  public void setParameters(IParameters params) {
    this.params = (HeuristicSentenceAlignerParameters) params;
  }

  @Override
  protected boolean isSegmentSource() {
    return params.isSegmentSource();
  }

  @Override
  protected boolean isSegmentTarget() {
    return params.isSegmentTarget();
  }

  @Override
  protected boolean isUseCustomSourceRules() {
    return params.isUseCustomSourceRules();
  }

  @Override
  protected boolean isUseCustomTargetRules() {
    return params.isUseCustomTargetRules();
  }

  @Override
  protected String getCustomSourceRulesPath() {
    return params.getCustomSourceRulesPath();
  }

  @Override
  protected String getCustomTargetRulesPath() {
    return params.getCustomTargetRulesPath();
  }

  @Override
  protected boolean isCollapseWhitespace() {
    return params.isCollapseWhitespace();
  }

  @Override
  protected void performAlignment(List<ITextUnit> sourceTUs, List<ITextUnit> targetTUs) {
    filterMtArtifacts(sourceTUs, targetTUs);

    LOGGER.info("Alignment: {} source, {} target paragraphs", sourceTUs.size(), targetTUs.size());

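    // Enable translation-aware re-segmentation: per getDescription(), the aligner can re-split
    // sentences using the back-translation signal at both paragraph and sentence level.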
    aligner.setTranslationAwareResegmentation(true);

    if (params.isForceSimpleOneToOneAlignment() && sourceTUs.size() == targetTUs.size()
        && sourceTUs.size() > 0) {
      LOGGER.info("Forcing 1:1 alignment");
      performSimpleOneToOneAlignment(sourceTUs, targetTUs);

    } else {
      // Use M:N paragraph matching over the paragraph similarity matrix
      performManyToManyAlignment(sourceTUs, targetTUs);
    }

    // Set alignment origin on all remaining source TUs
    for (ITextUnit tu : sourceTUs) {
      OkapiUtil.setAlOrigin(tu, getSourceLocale(), getTargetLocale());
    }
  }

  /**
   * Simple 1:1 alignment when paragraph counts match
   */
  private void performSimpleOneToOneAlignment(List<ITextUnit> sourceTUs,
      List<ITextUnit> targetTUs) {

    for (int i = 0; i < sourceTUs.size(); i++) {
      ITextUnit srcTu = sourceTUs.get(i);
      ITextUnit trgTu = targetTUs.get(i);

      alignSentencesInParagraphPair(srcTu, trgTu);

      LOGGER.debug("1:1 aligned paragraph pair {}", i);
    }
  }

  /**
   * M:N paragraph alignment over a paragraph-similarity matrix.
   * Groups source and target paragraphs via the greedy windowed matcher in findParagraphMatches().
   * Marks consumed TUs as non-translatable so they're filtered by the base class.
   */
  private void performManyToManyAlignment(List<ITextUnit> sourceTUs, List<ITextUnit> targetTUs) {
    int m = sourceTUs.size();
    int n = targetTUs.size();

    // Build similarity matrix between all source and target paragraphs
    double[][] simMatrix = new double[m][n];

    for (int i = 0; i < m; i++) {
      String srcText = getPlainText(sourceTUs.get(i));

      for (int j = 0; j < n; j++) {
        String trgText = getPlainText(targetTUs.get(j));
        simMatrix[i][j] = aligner.calculateParagraphSimilarity(
            srcText.toLowerCase().trim(),
            trgText.toLowerCase().trim(),
            getSourceLocale(),
            getTargetLocale());
      }
    }
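
    // Illustrative (hypothetical) example: with m = 3 source and n = 2 target paragraphs the
    // matrix could look like
    //            t0    t1
    //     s0   0.82  0.10
    //     s1   0.15  0.40
    //     s2   0.12  0.45
    // in which case findParagraphMatches() would typically pair s0 with t0 and the group
    // {s1, s2} with t1, provided the group scores clear PARAGRAPH_MATCH_THRESHOLD.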

    // Find paragraph groups using a windowed approach
    List<ParagraphMatch> matches = findParagraphMatches(simMatrix);

    // Apply the matches - consumed TUs are marked as non-translatable
    int totalConsumed = 0;
    for (ParagraphMatch match : matches) {
      List<ITextUnit> consumed = applyParagraphMatch(match, sourceTUs, targetTUs);
      totalConsumed += consumed.size();
    }

    if (totalConsumed > 0) {
      LOGGER.info("Marked {} source TUs as non-translatable (merged into other TUs)",
          totalConsumed);
    }
  }

  /**
   * Find paragraph matches using a greedy windowed approach.
   * Groups consecutive source paragraphs and matches them to target groups.
   */
  private List<ParagraphMatch> findParagraphMatches(double[][] simMatrix) {
    List<ParagraphMatch> matches = new ArrayList<>();
    int m = simMatrix.length;
    if (m == 0) {
      return matches; // guard: no source paragraphs, nothing to match
    }
    int n = simMatrix[0].length;

    boolean[] srcUsed = new boolean[m];
    boolean[] trgUsed = new boolean[n];

    int srcPtr = 0;
    int trgPtr = 0;

    while (srcPtr < m) {
      // Try different source window sizes (1 to 6 paragraphs)
      ParagraphMatch bestMatch = null;
      double bestScore = -1.0;         // unadjusted similarity of the best candidate (used for the threshold check)
      double bestAdjustedScore = -1.0; // size-penalized similarity used to rank candidates

      for (int srcWindow = 1; srcWindow <= Math.min(6, m - srcPtr); srcWindow++) {
        // Check if any source in this window is already used
        boolean srcConflict = false;
        for (int s = srcPtr; s < srcPtr + srcWindow; s++) {
          if (srcUsed[s]) {
            srcConflict = true;
            break;
          }
        }
        if (srcConflict)
          break;

        // Try different target window sizes
        for (int trgWindow = 0; trgWindow <= Math.min(6, n - trgPtr); trgWindow++) {
          // Check if any target in this window is already used
          boolean trgConflict = false;
          for (int t = trgPtr; t < trgPtr + trgWindow; t++) {
            if (trgUsed[t]) {
              trgConflict = true;
              break;
            }
          }
          if (trgConflict)
            break;

          // Calculate combined similarity for this match
          double score = calculateGroupSimilarity(
              simMatrix, srcPtr, srcWindow, trgPtr, trgWindow);

          // Penalize mismatched group sizes
          double sizePenalty = Math.abs(srcWindow - trgWindow) * 0.05;
          double adjustedScore = score - sizePenalty;

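          // Example: a 3:1 grouping incurs a penalty of |3 - 1| * 0.05 = 0.10 before ranking.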
          if (adjustedScore > bestAdjustedScore) {
            bestAdjustedScore = adjustedScore;
            bestScore = score; // keep the unadjusted score for the threshold check below
            bestMatch = new ParagraphMatch(srcPtr, srcWindow, trgPtr, trgWindow, score);
          }
        }
      }

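      // Accept the best candidate if its (unadjusted) score reaches the paragraph threshold,
      // or, when no target group was found, treat it as a deletion of at most 3 source paragraphs.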
      if (bestMatch != null && (bestMatch.targetCount > 0
          ? bestScore >= HeuristicAligner.PARAGRAPH_MATCH_THRESHOLD
          : bestMatch.sourceCount <= 3)) { // Allow small deletions

        // Mark as used
        for (int s = bestMatch.sourceStart; s < bestMatch.sourceStart
            + bestMatch.sourceCount; s++) {
          srcUsed[s] = true;
        }
        for (int t = bestMatch.targetStart; t < bestMatch.targetStart
            + bestMatch.targetCount; t++) {
          trgUsed[t] = true;
        }

        matches.add(bestMatch);
        srcPtr = bestMatch.sourceStart + bestMatch.sourceCount;
        trgPtr = bestMatch.targetStart + bestMatch.targetCount;

        LOGGER.info("Paragraph match: s[{}..{}] -> t[{}..{}] (score: {})",
            bestMatch.sourceStart, bestMatch.sourceStart + bestMatch.sourceCount - 1,
            bestMatch.targetStart, bestMatch.targetStart + bestMatch.targetCount - 1,
            String.format("%.3f", bestScore));
      } else {
        // No good match, advance by 1
        srcPtr++;
      }
    }

    return matches;
  }

  /**
   * Calculate average similarity between a group of source and target paragraphs
   */
  private double calculateGroupSimilarity(double[][] simMatrix,
      int srcStart, int srcCount, int trgStart, int trgCount) {

    if (srcCount == 0 || trgCount == 0) {
      return 0.0; // Deletion or insertion
    }

    double totalScore = 0.0;
    int count = 0;

    for (int i = srcStart; i < srcStart + srcCount; i++) {
      for (int j = trgStart; j < trgStart + trgCount; j++) {
        totalScore += simMatrix[i][j];
        count++;
      }
    }

    return count > 0 ? totalScore / count : 0.0;
  }

  /**
   * Apply a paragraph match by merging source and target paragraphs.
   * Returns list of consumed source TUs that should be marked as non-translatable.
   */
  private List<ITextUnit> applyParagraphMatch(ParagraphMatch match, List<ITextUnit> sourceTUs,
      List<ITextUnit> targetTUs) {

    List<ITextUnit> consumedTUs = new ArrayList<>();

    // Collect source TUs
    List<ITextUnit> srcGroup = new ArrayList<>();
    for (int i = match.sourceStart; i < match.sourceStart + match.sourceCount; i++) {
      srcGroup.add(sourceTUs.get(i));
    }

    // Collect target TUs
    List<ITextUnit> trgGroup = new ArrayList<>();
    for (int i = match.targetStart; i < match.targetStart + match.targetCount; i++) {
      trgGroup.add(targetTUs.get(i));
    }

    if (srcGroup.isEmpty()) {
      return consumedTUs;
    }

    // If srcGroup and trgGroup contain the same number of TUs, we align the TUs 1:1
    if (srcGroup.size() == trgGroup.size()) {
      for (int i = 0; i < srcGroup.size(); i++) {
        ITextUnit srcTu = srcGroup.get(i);
        ITextUnit trgTu = trgGroup.get(i);

        alignSentencesInParagraphPair(srcTu, trgTu);
      }

      return consumedTUs;
    }

    // Primary source TU (first in group) - this is the one we'll keep
    ITextUnit primarySrc = srcGroup.get(0);

    // Merge additional source TUs into primary
    for (int i = 1; i < srcGroup.size(); i++) {
      ITextUnit additionalSrc = srcGroup.get(i);

      LOGGER.debug("Merging source TU {} into {}",
          additionalSrc.getId(), primarySrc.getId());

      // Merge all segments from additional TU into primary
      for (Segment seg : additionalSrc.getSource().getSegments()) {
        primarySrc.getSource().append(seg.clone());
      }

      // Mark as non-translatable and clear source so it won't be output
      additionalSrc.setIsTranslatable(false);
      additionalSrc.getSource().clear();

      consumedTUs.add(additionalSrc);
    }

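    // At this point primarySrc carries every source segment of the group; the extra source TUs
    // are empty and non-translatable, so the base class filters them out of the output.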
    if (trgGroup.isEmpty()) {
      // No target - create empty
      LOGGER.debug("Creating empty target for source group starting at {}", match.sourceStart);
      createEmptyTargetSegments(primarySrc);

    } else {
      // Merge target TUs into a single virtual target
      ITextUnit mergedTarget = trgGroup.get(0).clone();

      for (int i = 1; i < trgGroup.size(); i++) {
        ITextUnit additionalTrg = trgGroup.get(i);

        for (Segment seg : additionalTrg.getSource().getSegments()) {
          mergedTarget.getSource().append(seg.clone());
        }
      }

      // Align sentences between merged source and merged target
      alignSentencesInParagraphPair(primarySrc, mergedTarget);
    }

    return consumedTUs;
  }

  /**
   * Align sentences within a single paragraph pair using the heuristic aligner.
   */
  private void alignSentencesInParagraphPair(ITextUnit sourceTu, ITextUnit targetTu) {
    // Prepare source and target segment lists BEFORE modifying anything
    List<Segment> sourceSegments = new ArrayList<>(sourceTu.getSource().getSegments().asList());
    List<Segment> targetSegments = new ArrayList<>(targetTu.getSource().getSegments().asList());

    if (sourceSegments.isEmpty()) {
      LOGGER.warn("Source has no segments in paragraph pair");
      return;
    }

    if (targetSegments.isEmpty()) {
      LOGGER.warn("Target has no segments in paragraph pair");
      createEmptyTargetSegments(sourceTu);
      return;
    }

    // Temporarily set target so aligner can work with it
    TextContainer tempTarget = new TextContainer();

    for (Segment seg : targetSegments) {
      tempTarget.getSegments().append(seg.clone());
    }

    sourceTu.setTarget(getTargetLocale(), tempTarget);

    // Call aligner to get sentence matches
    List<SentenceMatch> matches = aligner.alignSentencesInTu(
        sourceTu, getSourceLocale(), getTargetLocale());

    // Create final target container and populate based on matches
    TextContainer finalTarget = sourceTu.createTarget(getTargetLocale(), false,
        IResource.CREATE_EMPTY);

    finalTarget.clear();

    // Apply alignment results
    for (SentenceMatch m : matches) {
      if (m.sourceIndex >= 0 && m.sourceIndex < sourceSegments.size()) {
        Segment srcSeg = sourceSegments.get(m.sourceIndex);

        if (m.targetIndex >= 0 && m.targetIndex < targetSegments.size()) {
          Segment trgSeg = targetSegments.get(m.targetIndex);
          finalTarget.append(new Segment(srcSeg.getId(), trgSeg.text.clone()));

        } else {
          // The aligner returned no target index (-1) for this source segment
          if (sourceSegments.size() == 1 || targetSegments.size() == 1) {
            // 1:1, M:1, 1:N cases
            finalTarget = targetTu.getSource();
            sourceTu.setTarget(getTargetLocale(), finalTarget);

            if (targetSegments.size() == 1) {
              sourceTu.getSource().joinAll();
            }

            if (sourceSegments.size() == 1) {
              finalTarget.joinAll();
            }

            break;

          } else {
            // TODO test real M:N cases
            finalTarget.append(new Segment(srcSeg.getId(), new TextFragment("???")));
          }
        }
      }
    }

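    // Mark the rebuilt target as segmented and its segments as aligned so downstream steps
    // treat this paragraph pair as sentence-aligned.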
    finalTarget.setHasBeenSegmentedFlag(true);
    finalTarget.getSegments().setAlignmentStatus(AlignmentStatus.ALIGNED);
  }

  /**
   * Create empty target segments for a source TU with no match
   */
  private void createEmptyTargetSegments(ITextUnit tu) {
    tu.createTarget(getTargetLocale(), false, IResource.CREATE_EMPTY);
    for (Segment s : tu.getSource().getSegments()) {
      tu.getTarget(getTargetLocale()).append(new Segment(s.getId(), new TextFragment("")));
    }
  }

  /**
   * Filter out MT artifacts from target text units.
   * Target metadata (such as "passage" markers) is kept.
   * Source metadata/duplicate filtering is currently disabled (see the commented-out block below).
   */
  private void filterMtArtifacts(List<ITextUnit> sourceTUs, List<ITextUnit> targetTUs) {
    // Filter targets - only MT artifacts, keep "passage" and other metadata
    List<ITextUnit> filtered = new ArrayList<>();
    for (ITextUnit tu : targetTUs) {
      String text = tu.getSource().toString().toLowerCase().trim();

      // Skip ONLY MT artifacts
      if (text.matches(".*\\bmachine\\b.*translated.*|.*google.*|.*auto.*translated.*")) {
        LOGGER.info("Filtered MT artifact: {}", text);
        continue;
      }
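      // Note: the pattern above is broad; any paragraph containing "google", or
      // "machine ... translated" / "auto ... translated", is treated as an MT artifact.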

      // Keep everything else, including "passage" metadata
      filtered.add(tu);
    }

    int removedCount = targetTUs.size() - filtered.size();

    if (removedCount > 0) {
      LOGGER.info("Filtered {} MT artifact paragraphs from target", removedCount);
    }

    targetTUs.clear();
    targetTUs.addAll(filtered);

    // Filter source: metadata AND duplicates
    // List<ITextUnit> srcFiltered = new ArrayList<>();
    // List<String> seenTexts = new ArrayList<>();
    //
    // for (ITextUnit tu : sourceTUs) {
    //   String text = tu.getSource().toString().toLowerCase().trim();
    //
    //   // Skip metadata markers from source only
    //   if (text.equals("passage") || text.isEmpty()) {
    //     LOGGER.info("Filtered source metadata/empty: {}", text);
    //     continue;
    //   }
    //
    //   // Check for duplicates
    //   boolean isDuplicate = false;
    //
    //   for (String seenText : seenTexts) {
    //     if (text.equals(seenText)) {
    //       LOGGER.info("Filtered duplicate source paragraph: {}",
    //           text.length() > 50 ? text.substring(0, 50) + "..." : text);
    //
    //       isDuplicate = true;
    //       break;
    //     }
    //   }
    //
    //   if (!isDuplicate) {
    //     srcFiltered.add(tu);
    //     seenTexts.add(text);
    //   }
    // }
    //
    // if (sourceTUs.size() != srcFiltered.size()) {
    //   LOGGER.info("Filtered {} source metadata/duplicate paragraphs",
    //       sourceTUs.size() - srcFiltered.size());
    //   sourceTUs.clear();
    //   sourceTUs.addAll(srcFiltered);
    // }
  }

  /**
   * Extract plain text from a text unit
   */
  private String getPlainText(ITextUnit tu) {
    StringBuilder sb = new StringBuilder();
    for (TextPart tp : tu.getSource().getParts()) {
      if (sb.length() > 0) {
        sb.append(" ");
      }
      sb.append(tp.getContent().getText());
    }
    return sb.toString().trim();
  }

  /**
   * Data class for paragraph match results
   */
  static class ParagraphMatch {
    int sourceStart;
    int sourceCount;
    int targetStart;
    int targetCount;
    double score;

    ParagraphMatch(int ss, int sc, int ts, int tc, double score) {
      this.sourceStart = ss;
      this.sourceCount = sc;
      this.targetStart = ts;
      this.targetCount = tc;
      this.score = score;
    }
  }

  /**
   * Simple data class to hold sentence match results
   */
  static class SentenceMatch {
    public int sourceIndex;
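    // -1 (the default) means the aligner found no matching target sentence for this source index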
    public int targetIndex = -1;
    public double score;

    public SentenceMatch(int s, int t, double sc) {
      this.sourceIndex = s;
      this.targetIndex = t;
      this.score = sc;
    }
  }
}