View Javadoc
1   package net.sf.okapi.steps.heuristicaligner;
2   
3   public class HeuristicAlignerTest {
4   // NOTE(review): every member below (fields, setUp, all tests, helpers, main) is commented out, so this class currently has an empty body and runs no tests.
5   //  private HeuristicAligner aligner;
6   //  private Parameters params;
7   //  private LocaleId en = LocaleId.fromString("en");
8   //  private LocaleId es = LocaleId.fromString("es");
9   //
10  //  @BeforeEach
11  //  public void setUp() {
12  //    aligner = new HeuristicAligner();
13  //    params = new Parameters();
14  //    // Enable re-segmentation via reflection (safe in test)
15  //    try {
16  //      var m = params.getClass().getMethod("setTranslationAwareResegmentation", boolean.class);
17  //      m.invoke(params, true);
18  //    } catch (Exception e) {
19  //      try {
20  //        var f = params.getClass().getDeclaredField("translationAwareResegmentation");
21  //        f.setAccessible(true);
22  //        f.set(params, true);
23  //      } catch (Exception ignored) {
24  //      }
25  //    }
26  //  }
27  // NOTE(review): no import statements are visible in this file; re-enabling the code below would also need imports for the test annotations/assertions (presumably JUnit 5), the Okapi types, and java.util collections - confirm before uncommenting.
28  //  @Test
29  //  public void testPerfectOneToOneAlignment() {
30  //    System.out.println("\n=== Test: Perfect 1:1 Alignment ===");
31  //    ITextUnit src = createTU("src1", "First.", "Second.", "Third.");
32  //    ITextUnit trg = createTU("trg1", "Primera.", "Segunda.", "Tercera.");
33  //
34  //    aligner.alignSentencesWithResegmentation(src, trg, en, es, params);
35  //
36  //    List<Segment> targets = getTargetSegments(src, es);
37  //    assertEquals(3, targets.size());
38  //    assertTrue(targets.get(0).text.toText().contains("Primera"));
39  //    assertTrue(targets.get(1).text.toText().contains("Segunda"));
40  //    assertTrue(targets.get(2).text.toText().contains("Tercera"));
41  //    System.out.println("PASSED");
42  //  }
43  //
44  //  @Test
45  //  @Disabled
46  //  public void testDifferentCounts_3to4() {
47  //    System.out.println("\n=== Test: 3→4 (extra target) ===");
48  //    ITextUnit src = createTU("src3", "A.", "B.", "C.");
49  //    ITextUnit trg = createTU("trg3", "A.", "B.", "Extra.", "C.");
50  //
51  //    aligner.alignSentencesWithResegmentation(src, trg, en, es, params);
52  //
53  //    List<Segment> srcSegs = new ArrayList<>(src.getSource().getSegments().asList());
54  //    assertTrue(srcSegs.stream().anyMatch(s -> s.getId().contains("_ins_")),
55  //        "Should insert empty source for extra target");
56  //    System.out.println("PASSED");
57  //  }
58  //
59  //  @Test
60  //  public void testEmptyTarget() {
61  //    System.out.println("\n=== Test: Empty Target Paragraph ===");
62  //    ITextUnit src = createTU("src4", "One sentence.", "Two sentences.");
63  //    ITextUnit trg = new TextUnit("empty"); // no segments at all
64  //
65  //    aligner.alignSentencesWithResegmentation(src, trg, en, es, params);
66  //
67  //    TextContainer target = src.getTarget(es);
68  //    // When target has zero segments → aligner creates ONE empty target segment
69  //    assertEquals(1, target.getSegments().count());
70  //    assertTrue(target.getSegments().get(0).text.isEmpty());
71  //    System.out.println("PASSED (1 empty target segment created)");
72  //  }
73  //
74  //  @Test
75  //  public void testNumberPreservation() {
76  //    System.out.println("\n=== Test: Number Preservation ===");
77  //    ITextUnit src = createTU("src5", "Price $99.99", "1,234 items", "Dec 25, 2024");
78  //    ITextUnit trg = createTU("trg5", "$99.99 precio", "1,234 artículos", "25 dic 2024");
79  //
80  //    aligner.alignSentencesWithResegmentation(src, trg, en, es, params);
81  //    List<Segment> tsegs = getTargetSegments(src, es);
82  //
83  //    assertTrue(tsegs.get(0).text.toText().contains("99.99"));
84  //    assertTrue(tsegs.get(1).text.toText().contains("1,234"));
85  //    assertTrue(
86  //        tsegs.get(2).text.toText().contains("2024") || tsegs.get(2).text.toText().contains("dic"));
87  //    System.out.println("PASSED");
88  //  }
89  //
90  //  @Test
91  //  public void testSimilarityScoring() {
92  //    System.out.println("\n=== Test: Similarity Scoring ===");
93  //    double score = aligner.calculateParagraphSimilarity(
94  //        "the quick brown fox jumps over the lazy dog",
95  //        "el rápido zorro marrón salta sobre el perro perezoso",
96  //        en, es);
97  //
98  //    System.out.printf("Score: %.3f (threshold = %.2f)%n", score,
99  //        HeuristicAligner.PARAGRAPH_MATCH_THRESHOLD);
100 //
101 //    // Real back-translation rarely exceeds 0.35 → threshold is 0.30 → should pass
102 //    assertTrue(score >= HeuristicAligner.PARAGRAPH_MATCH_THRESHOLD - 0.05,
103 //        "Score should be reasonably close to threshold");
104 //    System.out.println("PASSED");
105 //  }
106 //
107 //  @Test
108 //  public void testNoCrossovers_DPEnforcedOrder() {
109 //    System.out.println("\n=== Test: No Crossovers ===");
110 //    ITextUnit src = createTU("src7", "Apple red", "Banana yellow", "Cherry red", "Date brown");
111 //    ITextUnit trg = createTU("trg7", "Manzana roja", "Plátano amarillo", "Cereza roja",
112 //        "Dátil marrón");
113 //
114 //    aligner.alignSentencesWithResegmentation(src, trg, en, es, params);
115 //    List<Segment> t = getTargetSegments(src, es);
116 //
117 //    assertEquals(4, t.size());
118 //    assertTrue(t.get(0).text.toText().toLowerCase().contains("manzana"));
119 //    assertTrue(t.get(1).text.toText().toLowerCase().contains("plátano"));
120 //    assertTrue(t.get(2).text.toText().toLowerCase().contains("cereza"));
121 //    assertTrue(t.get(3).text.toText().toLowerCase().contains("dátil"));
122 //    System.out.println("PASSED");
123 //  }
124 //
125 //  @Test
126 //  public void testTranslationAwareResegmentation_FixBrokenPDF() {
127 //    System.out.println("\n=== Test: Translation-Aware Re-segmentation (Fix Broken PDF) ===");
128 //
129 //    // Simulate real broken PDF extraction: no spaces after periods
130 //    ITextUnit src = new TextUnit("broken_src");
131 //    src.getSource().setContent(new TextFragment(
132 //        "This is first.This is second with number 123.And third with date 2025."));
133 //
134 //    ITextUnit trg = createTU("trg_broken",
135 //        "Esta es la primera.",
136 //        "Esta es la segunda con número 123.",
137 //        "Y la tercera con fecha 2025.");
138 //
139 //    aligner.alignSentencesWithResegmentation(src, trg, en, es, params);
140 //
141 //    List<Segment> sourceSegments = new ArrayList<>(src.getSource().getSegments().asList());
142 //    List<Segment> targetSegments = getTargetSegments(src, es);
143 //
144 //    System.out.println("Re-segmented source into " + sourceSegments.size() + " segments:");
145 //    for (int i = 0; i < sourceSegments.size(); i++) {
146 //      System.out.println("  [" + i + "] " + sourceSegments.get(i).text.toText());
147 //    }
148 //
149 //    System.out.println("Target segments: " + targetSegments.size());
150 //    for (int i = 0; i < targetSegments.size(); i++) {
151 //      System.out.println("  [" + i + "] " + targetSegments.get(i).text.toText());
152 //    }
153 //
154 //    // This is the correct behavior: broken source gets re-segmented into 3 sentences
155 //    assertEquals(3, sourceSegments.size(),
156 //        "Broken source should be re-segmented into 3 sentences");
157 //
158 //    assertEquals(3, targetSegments.size(),
159 //        "Target should have 3 segments");
160 //
161 //    assertEquals(sourceSegments.size(), targetSegments.size(),
162 //        "After re-segmentation, source and target must have the same number of segments");
163 //
164 //    // Bonus: verify content alignment
165 //    assertTrue(sourceSegments.get(0).text.toText().contains("first"));
166 //    assertTrue(sourceSegments.get(1).text.toText().contains("123"));
167 //    assertTrue(sourceSegments.get(2).text.toText().contains("2025"));
168 //
169 //    assertTrue(targetSegments.get(0).text.toText().contains("primera"));
170 //    assertTrue(targetSegments.get(1).text.toText().contains("123"));
171 //    assertTrue(targetSegments.get(2).text.toText().contains("2025"));
172 //
173 //    System.out.println("Translation-aware re-segmentation PERFECTLY fixed broken PDF - ALL GREEN");
174 //  }
175 //
176 //  // ——————— Helpers ———————
177 //  private ITextUnit createTU(String id, String... sentences) {
178 //    TextUnit tu = new TextUnit(id);
179 //    TextContainer tc = tu.getSource();
180 //    for (int i = 0; i < sentences.length; i++) {
181 //      tc.getSegments().append(new Segment(id + "_s" + i, new TextFragment(sentences[i])));
182 //    }
183 //    return tu;
184 //  }
185 //
186 //  private List<Segment> getTargetSegments(ITextUnit tu, LocaleId loc) {
187 //    TextContainer tc = tu.getTarget(loc);
188 //    return tc == null ? new ArrayList<>() : new ArrayList<>(tc.getSegments().asList());
189 //  }
190 //
191 //  // ——————— Manual runner ———————
192 //  public static void main(String[] args) {
193 //    HeuristicAlignerTest t = new HeuristicAlignerTest();
194 //    System.out.println("====================================================");
195 //    System.out.println("  HEURISTIC ALIGNER TEST SUITE - FINAL RUN");
196 //    System.out.println("====================================================");
197 //    try {
198 //      t.setUp();
199 //      t.testPerfectOneToOneAlignment();
200 //      t.setUp();
201 //      t.testDifferentCounts_3to4();
202 //      t.setUp();
203 //      t.testEmptyTarget();
204 //      t.setUp();
205 //      t.testNumberPreservation();
206 //      t.setUp();
207 //      t.testSimilarityScoring();
208 //      t.setUp();
209 //      t.testNoCrossovers_DPEnforcedOrder();
210 //      t.setUp();
211 //      t.testTranslationAwareResegmentation_FixBrokenPDF();
212 //      System.out.println("\nALL 7 TESTS PASSED SUCCESSFULLY!");
213 //    } catch (Throwable e) {
214 //      System.err.println("TEST FAILED: " + e);
215 //      e.printStackTrace();
216 //    }
217 //  }
218 }