View Javadoc
1   package net.sf.okapi.steps.sentencealigner;
2   
3   import static org.junit.jupiter.api.Assertions.*;
4   
5   import java.util.ArrayList;
6   import java.util.List;
7   
8   import org.junit.jupiter.api.BeforeEach;
9   import org.junit.jupiter.api.Test;
10  
11  import net.sf.okapi.common.LocaleId;
12  import net.sf.okapi.common.resource.ISegments;
13  import net.sf.okapi.common.resource.ITextUnit;
14  import net.sf.okapi.common.resource.Segment;
15  import net.sf.okapi.common.resource.TextContainer;
16  import net.sf.okapi.common.resource.TextFragment;
17  import net.sf.okapi.common.resource.TextPart;
18  import net.sf.okapi.common.resource.TextUnit;
19  import net.sf.okapi.steps.gcaligner.AlignmentScorer;
20  
21  class TestSentenceAligner {
22  
23    private SentenceAligner aligner;
24    private final LocaleId srcLoc = LocaleId.fromString("en");
25    private final LocaleId trgLoc = LocaleId.fromString("ru");
26  
27    @BeforeEach
28    void setUp() {
29      List<AlignmentScorer<Segment>> scorers = new ArrayList<>();
30      scorers.add(new FlexibleScorer());
31      aligner = new SentenceAligner(scorers);
32    }
33  
34    @Test
35    void test1to1Alignment() {
36      ITextUnit src = createTextUnit("Hello world.", "s1");
37      ITextUnit trg = createTextUnit("Привет мир.", "t1");
38  
39      ITextUnit result = aligner.align(src, trg, srcLoc, trgLoc, false);
40  
41      ISegments srcSegs = result.getSource().getSegments();
42      ISegments trgSegs = result.getTarget(trgLoc).getSegments();
43  
44      assertEquals(1, srcSegs.count());
45      assertEquals(1, trgSegs.count());
46  
47      Segment srcSeg = srcSegs.get(0);
48      Segment trgSeg = trgSegs.get(0);
49  
50      assertEquals("s1", srcSeg.getId());
51      assertEquals("s1", trgSeg.getId());
52      assertEquals("Hello world.", srcSeg.getContent().toText());
53      assertEquals("Привет мир.", trgSeg.getContent().toText());
54    }
55  
56    @Test
57    void test1to2Alignment() {
58      ITextUnit src = createTextUnit("Click OK to continue.", "s1");
59      ITextUnit trg = createTextUnit(
60          "Нажмите «ОК» для продолжения. " +
61              "После индикатора установка завершится.",
62          "t1", "t2");
63  
64      ITextUnit result = aligner.align(src, trg, srcLoc, trgLoc, false);
65  
66      ISegments srcSegs = result.getSource().getSegments();
67      ISegments trgSegs = result.getTarget(trgLoc).getSegments();
68  
69      assertEquals(1, srcSegs.count());
70      assertEquals(1, trgSegs.count());
71  
72      Segment srcSeg = srcSegs.get(0);
73      Segment trgSeg = trgSegs.get(0);
74  
75      assertEquals("s1", srcSeg.getId());
76      assertEquals("s1", trgSeg.getId());
77  
78      assertEquals("Click OK to continue.", srcSeg.getContent().toText());
79      
80      String trgContent = trgSeg.getContent().toText();
81      assertTrue(trgContent.contains("Нажмите") && trgContent.contains("После"));
82    }
83  
84    @Test
85    void test2to1Alignment() {
86      ITextUnit src = createTextUnit("First sentence. Second sentence.", "s1", "s2");
87      ITextUnit trg = createTextUnit("Первое и второе предложение.", "t1");
88  
89      ITextUnit result = aligner.align(src, trg, srcLoc, trgLoc, false);
90  
91      ISegments srcSegs = result.getSource().getSegments();
92      ISegments trgSegs = result.getTarget(trgLoc).getSegments();
93  
94      assertEquals(1, srcSegs.count());
95      assertEquals(1, trgSegs.count());
96  
97      Segment srcSeg = srcSegs.get(0);
98      Segment trgSeg = trgSegs.get(0);
99  
100     assertEquals("s1", srcSeg.getId());
101     assertEquals("s1", trgSeg.getId());
102 
103     String srcContent = srcSeg.getContent().toText();
104     assertTrue(srcContent.contains("First") && srcContent.contains("Second"));
105     assertEquals("Первое и второе предложение.", trgSeg.getContent().toText());
106   }
107 
108   @Test
109   void test2to2Alignment() {
110     ITextUnit src = createTextUnit("A. B.", "s1", "s2");
111     ITextUnit trg = createTextUnit("А. Б.", "t1", "t2");
112 
113     ITextUnit result = aligner.align(src, trg, srcLoc, trgLoc, false);
114 
115     ISegments srcSegs = result.getSource().getSegments();
116     ISegments trgSegs = result.getTarget(trgLoc).getSegments();
117 
118     assertEquals(2, srcSegs.count());
119     assertEquals(2, trgSegs.count());
120 
121     assertEquals("s1", srcSegs.get(0).getId());
122     assertEquals("s1", trgSegs.get(0).getId());
123     assertEquals("s2", srcSegs.get(1).getId());
124     assertEquals("s2", trgSegs.get(1).getId());
125   }
126 
127   @Test
128   void testPreserveInterSegmentSpace() {
129     ITextUnit src = createTextUnit("One. Two.", "s1", "s2");
130     ITextUnit trg = createTextUnit("Один. Два.", "t1", "t2");
131 
132     ITextUnit result = aligner.align(src, trg, srcLoc, trgLoc, false);
133 
134     List<TextPart> parts = result.getTarget(trgLoc).getParts();
135 
136     boolean foundSpace = false;
137     for (TextPart part : parts) {
138       if (!part.isSegment() && " ".equals(part.getContent().toText())) {
139         foundSpace = true;
140         break;
141       }
142     }
143     
144     assertTrue(foundSpace, "Inter-segment space should be preserved");
145   }
146 
147   @Test
148   void testNoSourceToTargetContentCopy() {
149     ITextUnit src = createTextUnit("Source only.", "s1");
150     ITextUnit trg = createTextUnit("Target only.", "t1");
151 
152     ITextUnit result = aligner.align(src, trg, srcLoc, trgLoc, false);
153 
154     String srcText = result.getSource().toString();
155     String trgText = result.getTarget(trgLoc).toString();
156 
157     assertTrue(srcText.contains("Source only."));
158     assertTrue(trgText.contains("Target only."));
159     assertFalse(trgText.contains("Source only."));
160     assertFalse(srcText.contains("Target only."));
161   }
162 
163   private static class FlexibleScorer implements AlignmentScorer<Segment> {
164     @SuppressWarnings("unused")
165     private LocaleId srcLoc, trgLoc;
166 
167     @Override
168     public void setLocales(LocaleId sourceLocale, LocaleId targetLocale) {
169       this.srcLoc = sourceLocale;
170       this.trgLoc = targetLocale;
171     }
172 
173     @Override
174     public int substitutionScore(Segment s, Segment t) {
175       if (s.getContent().toText().isEmpty() || t.getContent().toText().isEmpty()) {
176         return 0;
177       }
178       return 100;
179     }
180 
181     @Override
182     public int deletionScore(Segment s) {
183       return -50;
184     }
185 
186     @Override
187     public int insertionScore(Segment t) {
188       return -50;
189     }
190 
191     @Override
192     public int contractionScore(Segment s1, Segment s2, Segment t) {
193       if (s1.getContent().toText().isEmpty() || 
194           s2.getContent().toText().isEmpty() || 
195           t.getContent().toText().isEmpty()) {
196         return 0;
197       }
198       return 90;
199     }
200 
201     @Override
202     public int expansionScore(Segment s, Segment t1, Segment t2) {
203       if (s.getContent().toText().isEmpty() || 
204           t1.getContent().toText().isEmpty() || 
205           t2.getContent().toText().isEmpty()) {
206         return 0;
207       }
208       return 90;
209     }
210 
211     @Override
212     public int meldingScore(Segment s1, Segment s2, Segment t1, Segment t2) {
213       if (s1.getContent().toText().isEmpty() || 
214           s2.getContent().toText().isEmpty() ||
215           t1.getContent().toText().isEmpty() || 
216           t2.getContent().toText().isEmpty()) {
217         return 0;
218       }
219       return 180;
220     }
221   }
222 
223   private ITextUnit createTextUnit(String text, String... segIds) {
224     ITextUnit tu = new TextUnit("tu1");
225     TextContainer tc = tu.getSource();
226 
227     String[] sentences = text.split("(?<=[.!?])\\s+");
228 
229     tc.append(new TextPart(""));
230 
231     int textPos = 0;
232 
233     for (int i = 0; i < sentences.length; i++) {
234       String sentence = sentences[i];
235 
236       int start = text.indexOf(sentence, textPos);
237       int end = start + sentence.length();
238 
239       Segment seg = new Segment(segIds[i], new TextFragment(sentence));
240       tc.getSegments().append(seg);
241 
242       if (i < sentences.length - 1) {
243         String interSegment = text.substring(end, text.indexOf(sentences[i + 1], end));
244         tc.append(new TextPart(interSegment));
245       } else {
246         tc.append(new TextPart(""));
247       }
248 
249       textPos = end;
250     }
251 
252     tc.setHasBeenSegmentedFlag(true);
253     return tu;
254   }
255 }