View Javadoc
1   /**
2    * ===========================================================================
3    * Additional changes Copyright (C) 2009-2011 by the Okapi Framework contributors
4    * ===========================================================================
5    */
6   package net.sf.okapi.steps.sentencealigner;
7   
8   import java.util.List;
9   
10  import org.slf4j.Logger;
11  import org.slf4j.LoggerFactory;
12  
13  import net.sf.okapi.common.IResource;
14  import net.sf.okapi.common.LocaleId;
15  import net.sf.okapi.common.resource.AlignmentStatus;
16  import net.sf.okapi.common.resource.ITextUnit;
17  import net.sf.okapi.common.resource.Segment;
18  import net.sf.okapi.common.resource.TextContainer;
19  import net.sf.okapi.common.resource.TextFragment;
20  import net.sf.okapi.common.resource.TextPart;
21  import net.sf.okapi.common.resource.TextUnit;
22  import net.sf.okapi.steps.gcaligner.AlignmentFunction;
23  import net.sf.okapi.steps.gcaligner.AlignmentScorer;
24  import net.sf.okapi.steps.gcaligner.DpMatrix;
25  import net.sf.okapi.steps.gcaligner.DpMatrixCell;
26  import net.sf.okapi.steps.gcaligner.Penalties;
27  
28  /**
29   * SentenceAligner aligns source and target (paragraph) {@link TextUnit}s.
30   *
31   * @version 1.47.0
32   *
33   *          FIX: Preserves inter-segment whitespace during alignment by:
34   *          1. Copying target container structure FIRST (all segments + inter-segment parts)
35   *          2. Then updating segment IDs and merging as needed
36   *          3. Never clearing containers after initial copy
37   */
38  public class SentenceAligner {
39    private final Logger LOGGER = LoggerFactory.getLogger(getClass());
40    private static final long MAX_CELL_SIZE = 80000L;
41    private final List<AlignmentScorer<Segment>> scorerList;
42  
43    public SentenceAligner(List<AlignmentScorer<Segment>> scorerList) {
44      this.scorerList = scorerList;
45    }
46  
47    public ITextUnit align(ITextUnit sourceParagraph, ITextUnit targetParagraph, LocaleId srcLocale,
48        LocaleId trgLocale, boolean outputOneTOneMatchesOnly) {
49      return alignWithoutSkeletonAlignment(sourceParagraph, targetParagraph, srcLocale, trgLocale,
50          outputOneTOneMatchesOnly);
51    }
52  
53    public ITextUnit align(ITextUnit bilingualParagraph, LocaleId srcLocale, LocaleId trgLocale,
54        boolean outputOneTOneMatchesOnly) {
55      return alignWithoutSkeletonAlignment(bilingualParagraph, srcLocale, trgLocale,
56          outputOneTOneMatchesOnly);
57    }
58  
59    private ITextUnit alignWithoutSkeletonAlignment(ITextUnit sourceParagraph,
60        ITextUnit targetParagraph, LocaleId srcLocale, LocaleId trgLocale,
61        boolean outputOneTOneMatchesOnly) {
62      AlignmentFunction<Segment> alignmentFunction = new AlignmentFunction<>(srcLocale,
63          trgLocale, scorerList, new Penalties());
64      return alignSegments(sourceParagraph, targetParagraph, srcLocale, trgLocale,
65          alignmentFunction, outputOneTOneMatchesOnly);
66    }
67  
68    private ITextUnit alignWithoutSkeletonAlignment(ITextUnit bilingualParagraph, LocaleId srcLocale,
69        LocaleId trgLocale, boolean outputOneTOneMatchesOnly) {
70      AlignmentFunction<Segment> alignmentFunction = new AlignmentFunction<>(srcLocale,
71          trgLocale, scorerList, new Penalties());
72      return alignSegments(bilingualParagraph, srcLocale, trgLocale, alignmentFunction,
73          outputOneTOneMatchesOnly);
74    }
75  
76    /*
77     * --------------------------------------------------------------
78     * MONOLINGUAL CASE (sourceParagraph + separate targetParagraph)
79     * --------------------------------------------------------------
80     */
81    private ITextUnit alignSegments(ITextUnit sourceParagraph, ITextUnit targetParagraph,
82        LocaleId srcLocale, LocaleId trgLocale, AlignmentFunction<Segment> alignmentFunction,
83        boolean outputOneTOneMatchesOnly) {
84  
85      if (sourceParagraph.getSource().getSegments().count()
86          * targetParagraph.getSource().getSegments().count() > MAX_CELL_SIZE) {
87        throw new IllegalArgumentException("Too many segments. Can only align "
88            + MAX_CELL_SIZE
89            + ". Where the number equals the source segments times the target segments.");
90      }
91  
92      DpMatrix<Segment> matrix = new DpMatrix<>(sourceParagraph.getSource().getSegments().asList(),
93          targetParagraph.getSource().getSegments().asList(), alignmentFunction);
94  
95      List<DpMatrixCell> result = matrix.align();
96  
97      String srcTuid = sourceParagraph.getName() == null ? "unknown" : sourceParagraph.getName();
98  
99      // ---- 1. Copy target structure COMPLETELY (preserving ALL parts) ----
100     sourceParagraph.createTarget(trgLocale, false, IResource.CREATE_EMPTY);
101     TextContainer srcCont = sourceParagraph.getSource();
102     TextContainer trgCont = sourceParagraph.getTarget(trgLocale);
103 
104     if (targetParagraph.getSource().getSegments().count() == 0) {
105       LOGGER.warn("Target paragraph {} has no segments – using empty target container.", srcTuid);
106       return sourceParagraph;
107     }
108 
109     trgCont.clear();
110 
111     // Copy ALL parts from target (including inter-segment whitespace)
112     // This preserves the complete structure including spaces
113     for (TextPart part : targetParagraph.getSource().getParts()) {
114       trgCont.append(part.clone());
115     }
116     trgCont.setHasBeenSegmentedFlag(true);
117 
118     // ---- 2. Process alignment results and update IDs / merge segments ----
119     int srcPos = 0;
120     int trgPos = 0;
121 
122     for (DpMatrixCell cell : result) {
123       if (outputOneTOneMatchesOnly) {
124         if (cell.getState() == DpMatrixCell.MATCH) {
125           trgCont.getSegments().get(trgPos).id = srcCont.getSegments().get(srcPos).id;
126           srcPos++;
127           trgPos++;
128         }
129         continue;
130       }
131 
132       switch (cell.getState()) {
133       case DpMatrixCell.MATCH:
134         // Simple 1:1 match - just update target segment ID
135         trgCont.getSegments().get(trgPos).id = srcCont.getSegments().get(srcPos).id;
136         srcPos++;
137         trgPos++;
138         break;
139 
140       case DpMatrixCell.MULTI_MATCH:
141         int xBegin = cell.getMultiMatchXIndexBegin();
142         int xEnd = cell.getMultiMatchXIndexEnd();
143         int yBegin = cell.getMultiMatchYIndexBegin();
144         int yEnd = cell.getMultiMatchYIndexEnd();
145 
146         int srcLen = xEnd - xBegin;
147         int trgLen = yEnd - yBegin;
148 
149         // Get the FIRST source segment ID (this is the group ID)
150         String groupId = srcCont.getSegments().get(srcPos).id;
151 
152         // Merge source segments (keeping the first ID)
153         for (int i = 0; i < srcLen - 1; i++) {
154           srcCont.getSegments().joinWithNext(srcPos);
155         }
156 
157         // Merge target segments and assign group ID
158         for (int i = 0; i < trgLen - 1; i++) {
159           trgCont.getSegments().joinWithNext(trgPos);
160         }
161 
162         // After merging, update the merged segment's ID to the group ID
163         trgCont.getSegments().get(trgPos).id = groupId;
164 
165         // Move past the merged segments (now count as 1 each)
166         srcPos++;
167         trgPos++;
168 
169         LOGGER.warn("{}\nMulti-Segment Match (TU ID: {}): Merged {}:{} segments.",
170             matrix.getAlignmentElementX(xBegin).toString(), srcTuid, srcLen, trgLen);
171         break;
172 
173       case DpMatrixCell.DELETED:
174         // Source segment has no corresponding target - insert empty target
175         Segment srcSeg = matrix.getAlignmentElementX(cell.getXindex());
176         trgCont.getSegments().insert(trgPos,
177             new Segment(srcCont.getSegments().get(srcPos).id, new TextFragment("")));
178         srcPos++;
179         trgPos++;
180         LOGGER.warn("{}\nTarget segment deleted (TU ID: {}): Inserted empty target.",
181             srcSeg.toString(), srcTuid);
182         break;
183 
184       case DpMatrixCell.INSERTED:
185         // Target segment has no corresponding source - insert empty source
186         Segment trgSeg = matrix.getAlignmentElementY(cell.getYindex());
187         srcCont.getSegments().insert(srcPos,
188             new Segment(trgCont.getSegments().get(trgPos).id, new TextFragment("")));
189         srcPos++;
190         trgPos++;
191         LOGGER.warn("{}\nSource segment deleted (TU ID: {}): Inserted empty source.",
192             trgSeg.toString(), srcTuid);
193         break;
194       }
195     }
196 
197     trgCont.getSegments().setAlignmentStatus(AlignmentStatus.ALIGNED);
198     return sourceParagraph;
199   }
200 
201   /*
202    * --------------------------------------------------------------
203    * BILINGUAL CASE (single TextUnit with source + target)
204    * --------------------------------------------------------------
205    */
206   private ITextUnit alignSegments(ITextUnit bilingualParagraph, LocaleId srcLocale,
207       LocaleId trgLocale, AlignmentFunction<Segment> alignmentFunction,
208       boolean outputOneTOneMatchesOnly) {
209 
210     if (bilingualParagraph.getSource().getSegments().count()
211         * bilingualParagraph.getTarget(trgLocale).getSegments().count() > MAX_CELL_SIZE) {
212       throw new IllegalArgumentException("Too many segments. Can only align "
213           + MAX_CELL_SIZE
214           + ". Where the number equals the source segments times the target segments.");
215     }
216 
217     DpMatrix<Segment> matrix = new DpMatrix<>(bilingualParagraph.getSource().getSegments().asList(),
218         bilingualParagraph.getTarget(trgLocale).getSegments().asList(), alignmentFunction);
219 
220     List<DpMatrixCell> result = matrix.align();
221 
222     String srcTuid = bilingualParagraph.getName() == null ? "unknown"
223         : bilingualParagraph.getName();
224 
225     TextContainer srcCont = bilingualParagraph.getSource();
226     TextContainer trgCont = bilingualParagraph.getTarget(trgLocale);
227 
228     if (srcCont.getSegments().count() == 0 || trgCont.getSegments().count() == 0) {
229       LOGGER.warn("Empty segments in bilingual paragraph {} – skipping alignment.", srcTuid);
230       return bilingualParagraph;
231     }
232 
233     // Target container already exists with all its parts - don't copy, just process
234     int srcPos = 0;
235     int trgPos = 0;
236 
237     for (DpMatrixCell cell : result) {
238       if (outputOneTOneMatchesOnly) {
239         if (cell.getState() == DpMatrixCell.MATCH) {
240           trgCont.getSegments().get(trgPos).id = srcCont.getSegments().get(srcPos).id;
241           srcPos++;
242           trgPos++;
243         }
244         continue;
245       }
246 
247       switch (cell.getState()) {
248       case DpMatrixCell.MATCH:
249         trgCont.getSegments().get(trgPos).id = srcCont.getSegments().get(srcPos).id;
250         srcPos++;
251         trgPos++;
252         break;
253 
254       case DpMatrixCell.MULTI_MATCH:
255         int xBegin = cell.getMultiMatchXIndexBegin();
256         int xEnd = cell.getMultiMatchXIndexEnd();
257         int yBegin = cell.getMultiMatchYIndexBegin();
258         int yEnd = cell.getMultiMatchYIndexEnd();
259 
260         int srcLen = xEnd - xBegin;
261         int trgLen = yEnd - yBegin;
262 
263         // Get the FIRST source segment ID (this is the group ID)
264         String groupId = srcCont.getSegments().get(srcPos).id;
265 
266         // Merge source segments
267         for (int i = 0; i < srcLen - 1; i++)
268           srcCont.getSegments().joinWithNext(srcPos);
269 
270         // Merge target segments
271         for (int i = 0; i < trgLen - 1; i++)
272           trgCont.getSegments().joinWithNext(trgPos);
273 
274         // Update merged target segment ID
275         trgCont.getSegments().get(trgPos).id = groupId;
276 
277         srcPos++;
278         trgPos++;
279 
280         LOGGER.warn("{}\nMulti-Segment Match (TU ID: {}): Merged {}:{} segments.",
281             matrix.getAlignmentElementX(xBegin).toString(), srcTuid, srcLen, trgLen);
282         break;
283 
284       case DpMatrixCell.DELETED:
285         Segment srcSeg = matrix.getAlignmentElementX(cell.getXindex());
286         trgCont.getSegments().insert(trgPos,
287             new Segment(srcCont.getSegments().get(srcPos).id, new TextFragment("")));
288         srcPos++;
289         trgPos++;
290         LOGGER.warn("{}\nTarget segment deleted (TU ID: {}): Inserted empty target.",
291             srcSeg.toString(), srcTuid);
292         break;
293 
294       case DpMatrixCell.INSERTED:
295         Segment trgSeg = matrix.getAlignmentElementY(cell.getYindex());
296         srcCont.getSegments().insert(srcPos,
297             new Segment(trgCont.getSegments().get(trgPos).id, new TextFragment("")));
298         srcPos++;
299         trgPos++;
300         LOGGER.warn("{}\nSource segment deleted (TU ID: {}): Inserted empty source.",
301             trgSeg.toString(), srcTuid);
302         break;
303       }
304     }
305 
306     trgCont.getSegments().setAlignmentStatus(AlignmentStatus.ALIGNED);
307     return bilingualParagraph;
308   }
309 }