1
2
3
4
5
6 package net.sf.okapi.steps.sentencealigner;
7
8 import java.util.List;
9
10 import org.slf4j.Logger;
11 import org.slf4j.LoggerFactory;
12
13 import net.sf.okapi.common.IResource;
14 import net.sf.okapi.common.LocaleId;
15 import net.sf.okapi.common.resource.AlignmentStatus;
16 import net.sf.okapi.common.resource.ITextUnit;
17 import net.sf.okapi.common.resource.Segment;
18 import net.sf.okapi.common.resource.TextContainer;
19 import net.sf.okapi.common.resource.TextFragment;
20 import net.sf.okapi.common.resource.TextPart;
21 import net.sf.okapi.common.resource.TextUnit;
22 import net.sf.okapi.steps.gcaligner.AlignmentFunction;
23 import net.sf.okapi.steps.gcaligner.AlignmentScorer;
24 import net.sf.okapi.steps.gcaligner.DpMatrix;
25 import net.sf.okapi.steps.gcaligner.DpMatrixCell;
26 import net.sf.okapi.steps.gcaligner.Penalties;
27
28
29
30
31
32
33
34
35
36
37
38 public class SentenceAligner {
39 private final Logger LOGGER = LoggerFactory.getLogger(getClass());
40 private static final long MAX_CELL_SIZE = 80000L;
41 private final List<AlignmentScorer<Segment>> scorerList;
42
43 public SentenceAligner(List<AlignmentScorer<Segment>> scorerList) {
44 this.scorerList = scorerList;
45 }
46
47 public ITextUnit align(ITextUnit sourceParagraph, ITextUnit targetParagraph, LocaleId srcLocale,
48 LocaleId trgLocale, boolean outputOneTOneMatchesOnly) {
49 return alignWithoutSkeletonAlignment(sourceParagraph, targetParagraph, srcLocale, trgLocale,
50 outputOneTOneMatchesOnly);
51 }
52
53 public ITextUnit align(ITextUnit bilingualParagraph, LocaleId srcLocale, LocaleId trgLocale,
54 boolean outputOneTOneMatchesOnly) {
55 return alignWithoutSkeletonAlignment(bilingualParagraph, srcLocale, trgLocale,
56 outputOneTOneMatchesOnly);
57 }
58
59 private ITextUnit alignWithoutSkeletonAlignment(ITextUnit sourceParagraph,
60 ITextUnit targetParagraph, LocaleId srcLocale, LocaleId trgLocale,
61 boolean outputOneTOneMatchesOnly) {
62 AlignmentFunction<Segment> alignmentFunction = new AlignmentFunction<>(srcLocale,
63 trgLocale, scorerList, new Penalties());
64 return alignSegments(sourceParagraph, targetParagraph, srcLocale, trgLocale,
65 alignmentFunction, outputOneTOneMatchesOnly);
66 }
67
68 private ITextUnit alignWithoutSkeletonAlignment(ITextUnit bilingualParagraph, LocaleId srcLocale,
69 LocaleId trgLocale, boolean outputOneTOneMatchesOnly) {
70 AlignmentFunction<Segment> alignmentFunction = new AlignmentFunction<>(srcLocale,
71 trgLocale, scorerList, new Penalties());
72 return alignSegments(bilingualParagraph, srcLocale, trgLocale, alignmentFunction,
73 outputOneTOneMatchesOnly);
74 }
75
76
77
78
79
80
81 private ITextUnit alignSegments(ITextUnit sourceParagraph, ITextUnit targetParagraph,
82 LocaleId srcLocale, LocaleId trgLocale, AlignmentFunction<Segment> alignmentFunction,
83 boolean outputOneTOneMatchesOnly) {
84
85 if (sourceParagraph.getSource().getSegments().count()
86 * targetParagraph.getSource().getSegments().count() > MAX_CELL_SIZE) {
87 throw new IllegalArgumentException("Too many segments. Can only align "
88 + MAX_CELL_SIZE
89 + ". Where the number equals the source segments times the target segments.");
90 }
91
92 DpMatrix<Segment> matrix = new DpMatrix<>(sourceParagraph.getSource().getSegments().asList(),
93 targetParagraph.getSource().getSegments().asList(), alignmentFunction);
94
95 List<DpMatrixCell> result = matrix.align();
96
97 String srcTuid = sourceParagraph.getName() == null ? "unknown" : sourceParagraph.getName();
98
99
100 sourceParagraph.createTarget(trgLocale, false, IResource.CREATE_EMPTY);
101 TextContainer srcCont = sourceParagraph.getSource();
102 TextContainer trgCont = sourceParagraph.getTarget(trgLocale);
103
104 if (targetParagraph.getSource().getSegments().count() == 0) {
105 LOGGER.warn("Target paragraph {} has no segments – using empty target container.", srcTuid);
106 return sourceParagraph;
107 }
108
109 trgCont.clear();
110
111
112
113 for (TextPart part : targetParagraph.getSource().getParts()) {
114 trgCont.append(part.clone());
115 }
116 trgCont.setHasBeenSegmentedFlag(true);
117
118
119 int srcPos = 0;
120 int trgPos = 0;
121
122 for (DpMatrixCell cell : result) {
123 if (outputOneTOneMatchesOnly) {
124 if (cell.getState() == DpMatrixCell.MATCH) {
125 trgCont.getSegments().get(trgPos).id = srcCont.getSegments().get(srcPos).id;
126 srcPos++;
127 trgPos++;
128 }
129 continue;
130 }
131
132 switch (cell.getState()) {
133 case DpMatrixCell.MATCH:
134
135 trgCont.getSegments().get(trgPos).id = srcCont.getSegments().get(srcPos).id;
136 srcPos++;
137 trgPos++;
138 break;
139
140 case DpMatrixCell.MULTI_MATCH:
141 int xBegin = cell.getMultiMatchXIndexBegin();
142 int xEnd = cell.getMultiMatchXIndexEnd();
143 int yBegin = cell.getMultiMatchYIndexBegin();
144 int yEnd = cell.getMultiMatchYIndexEnd();
145
146 int srcLen = xEnd - xBegin;
147 int trgLen = yEnd - yBegin;
148
149
150 String groupId = srcCont.getSegments().get(srcPos).id;
151
152
153 for (int i = 0; i < srcLen - 1; i++) {
154 srcCont.getSegments().joinWithNext(srcPos);
155 }
156
157
158 for (int i = 0; i < trgLen - 1; i++) {
159 trgCont.getSegments().joinWithNext(trgPos);
160 }
161
162
163 trgCont.getSegments().get(trgPos).id = groupId;
164
165
166 srcPos++;
167 trgPos++;
168
169 LOGGER.warn("{}\nMulti-Segment Match (TU ID: {}): Merged {}:{} segments.",
170 matrix.getAlignmentElementX(xBegin).toString(), srcTuid, srcLen, trgLen);
171 break;
172
173 case DpMatrixCell.DELETED:
174
175 Segment srcSeg = matrix.getAlignmentElementX(cell.getXindex());
176 trgCont.getSegments().insert(trgPos,
177 new Segment(srcCont.getSegments().get(srcPos).id, new TextFragment("")));
178 srcPos++;
179 trgPos++;
180 LOGGER.warn("{}\nTarget segment deleted (TU ID: {}): Inserted empty target.",
181 srcSeg.toString(), srcTuid);
182 break;
183
184 case DpMatrixCell.INSERTED:
185
186 Segment trgSeg = matrix.getAlignmentElementY(cell.getYindex());
187 srcCont.getSegments().insert(srcPos,
188 new Segment(trgCont.getSegments().get(trgPos).id, new TextFragment("")));
189 srcPos++;
190 trgPos++;
191 LOGGER.warn("{}\nSource segment deleted (TU ID: {}): Inserted empty source.",
192 trgSeg.toString(), srcTuid);
193 break;
194 }
195 }
196
197 trgCont.getSegments().setAlignmentStatus(AlignmentStatus.ALIGNED);
198 return sourceParagraph;
199 }
200
201
202
203
204
205
206 private ITextUnit alignSegments(ITextUnit bilingualParagraph, LocaleId srcLocale,
207 LocaleId trgLocale, AlignmentFunction<Segment> alignmentFunction,
208 boolean outputOneTOneMatchesOnly) {
209
210 if (bilingualParagraph.getSource().getSegments().count()
211 * bilingualParagraph.getTarget(trgLocale).getSegments().count() > MAX_CELL_SIZE) {
212 throw new IllegalArgumentException("Too many segments. Can only align "
213 + MAX_CELL_SIZE
214 + ". Where the number equals the source segments times the target segments.");
215 }
216
217 DpMatrix<Segment> matrix = new DpMatrix<>(bilingualParagraph.getSource().getSegments().asList(),
218 bilingualParagraph.getTarget(trgLocale).getSegments().asList(), alignmentFunction);
219
220 List<DpMatrixCell> result = matrix.align();
221
222 String srcTuid = bilingualParagraph.getName() == null ? "unknown"
223 : bilingualParagraph.getName();
224
225 TextContainer srcCont = bilingualParagraph.getSource();
226 TextContainer trgCont = bilingualParagraph.getTarget(trgLocale);
227
228 if (srcCont.getSegments().count() == 0 || trgCont.getSegments().count() == 0) {
229 LOGGER.warn("Empty segments in bilingual paragraph {} – skipping alignment.", srcTuid);
230 return bilingualParagraph;
231 }
232
233
234 int srcPos = 0;
235 int trgPos = 0;
236
237 for (DpMatrixCell cell : result) {
238 if (outputOneTOneMatchesOnly) {
239 if (cell.getState() == DpMatrixCell.MATCH) {
240 trgCont.getSegments().get(trgPos).id = srcCont.getSegments().get(srcPos).id;
241 srcPos++;
242 trgPos++;
243 }
244 continue;
245 }
246
247 switch (cell.getState()) {
248 case DpMatrixCell.MATCH:
249 trgCont.getSegments().get(trgPos).id = srcCont.getSegments().get(srcPos).id;
250 srcPos++;
251 trgPos++;
252 break;
253
254 case DpMatrixCell.MULTI_MATCH:
255 int xBegin = cell.getMultiMatchXIndexBegin();
256 int xEnd = cell.getMultiMatchXIndexEnd();
257 int yBegin = cell.getMultiMatchYIndexBegin();
258 int yEnd = cell.getMultiMatchYIndexEnd();
259
260 int srcLen = xEnd - xBegin;
261 int trgLen = yEnd - yBegin;
262
263
264 String groupId = srcCont.getSegments().get(srcPos).id;
265
266
267 for (int i = 0; i < srcLen - 1; i++)
268 srcCont.getSegments().joinWithNext(srcPos);
269
270
271 for (int i = 0; i < trgLen - 1; i++)
272 trgCont.getSegments().joinWithNext(trgPos);
273
274
275 trgCont.getSegments().get(trgPos).id = groupId;
276
277 srcPos++;
278 trgPos++;
279
280 LOGGER.warn("{}\nMulti-Segment Match (TU ID: {}): Merged {}:{} segments.",
281 matrix.getAlignmentElementX(xBegin).toString(), srcTuid, srcLen, trgLen);
282 break;
283
284 case DpMatrixCell.DELETED:
285 Segment srcSeg = matrix.getAlignmentElementX(cell.getXindex());
286 trgCont.getSegments().insert(trgPos,
287 new Segment(srcCont.getSegments().get(srcPos).id, new TextFragment("")));
288 srcPos++;
289 trgPos++;
290 LOGGER.warn("{}\nTarget segment deleted (TU ID: {}): Inserted empty target.",
291 srcSeg.toString(), srcTuid);
292 break;
293
294 case DpMatrixCell.INSERTED:
295 Segment trgSeg = matrix.getAlignmentElementY(cell.getYindex());
296 srcCont.getSegments().insert(srcPos,
297 new Segment(trgCont.getSegments().get(trgPos).id, new TextFragment("")));
298 srcPos++;
299 trgPos++;
300 LOGGER.warn("{}\nSource segment deleted (TU ID: {}): Inserted empty source.",
301 trgSeg.toString(), srcTuid);
302 break;
303 }
304 }
305
306 trgCont.getSegments().setAlignmentStatus(AlignmentStatus.ALIGNED);
307 return bilingualParagraph;
308 }
309 }