View Javadoc
1   package com.acumenvelocity.ath.common;
2   
3   import java.util.ArrayList;
4   import java.util.HashMap;
5   import java.util.List;
6   import java.util.Map;
7   import java.util.Objects;
8   
9   import com.acumenvelocity.ath.model.InlineCode;
10  import com.acumenvelocity.ath.model.InlineCodeRef;
11  import com.acumenvelocity.ath.model.LayeredSegment;
12  import com.acumenvelocity.ath.model.MtTargetInfo;
13  import com.acumenvelocity.ath.model.x.LayeredTextX;
14  import com.acumenvelocity.ath.mt.confidence.ConfidenceScoredTranslation;
15  
16  import net.sf.okapi.common.LocaleId;
17  import net.sf.okapi.common.annotation.AltTranslation;
18  import net.sf.okapi.common.annotation.AltTranslationsAnnotation;
19  import net.sf.okapi.common.resource.Code;
20  import net.sf.okapi.common.resource.TextContainer;
21  import net.sf.okapi.common.resource.TextFragment;
22  import net.sf.okapi.common.resource.TextFragment.TagType;
23  
24  public class ConversionUtil {
25  
26    /**
27     * Converts a LayeredText to a TextFragment (Okapi format).
28     * 
29     * @param fs the LayeredText to convert
30     * @return TextFragment representation, or null if input is null
31     */
32    public static TextFragment toTextFragment(LayeredTextX fs) {
33      if (fs == null) {
34        return null;
35      }
36  
37      // Start with the raw text
38      String text = fs.getText();
39      if (text == null) {
40        text = "";
41      }
42  
43      TextFragment tf = new TextFragment();
44      List<InlineCode> inlineCodes = fs.getCodes();
45  
46      if (inlineCodes == null || inlineCodes.isEmpty()) {
47        // No codes, just append the text
48        tf.append(text);
49        return tf;
50      }
51  
52      // Sort codes by position to process them in order
53      List<InlineCode> sortedCodes = new ArrayList<>(inlineCodes);
54      sortedCodes.sort((a, b) -> Integer.compare(a.getPosition(), b.getPosition()));
55  
56      int currentPos = 0;
57      int textLength = text.length();
58  
59      for (InlineCode ic : sortedCodes) {
60        int codePos = ic.getPosition();
61  
62        // Validate code position
63        if (codePos < 0) {
64          Log.warn(ConversionUtil.class,
65              "Invalid code position {} (negative) for code id={}, skipping",
66              codePos, ic.getId());
67  
68          continue;
69        }
70  
71        if (codePos > textLength) {
72          Log.warn(ConversionUtil.class,
73              "Invalid code position {} (exceeds text length {}) for code id={}, clamping to end",
74              codePos, textLength, ic.getId());
75  
76          codePos = textLength;
77        }
78  
79        // Append text before this code
80        if (codePos > currentPos) {
81  
82          // Additional safety check
83          if (currentPos < textLength) {
84            int endPos = Math.min(codePos, textLength);
85            tf.append(text.substring(currentPos, endPos));
86          }
87        }
88  
89        // Convert TagType enum
90        TagType tagType = ic.getTagType();
91  
92        // Create and append the code
93        Code code = tf.append(
94            tagType,
95            ic.getType() != null ? ic.getType() : "",
96            ic.getData() != null ? ic.getData() : "");
97  
98        // Set additional properties
99        if (ic.getId() != null) {
100         code.setId(ic.getId());
101       }
102 
103       if (ic.getOuterData() != null && !ic.getOuterData().isEmpty()) {
104         code.setOuterData(ic.getOuterData());
105       }
106 
107       if (ic.getDisplayText() != null) {
108         code.setDisplayText(ic.getDisplayText());
109       }
110 
111       if (ic.getOriginalId() != null) {
112         code.setOriginalId(ic.getOriginalId());
113       }
114 
115       // Set flags
116       if (ic.getFlag() != null) {
117         int flag = ic.getFlag();
118 
119         if ((flag & 0x01) != 0) {
120           code.setReferenceFlag(true);
121         }
122 
123         if ((flag & 0x02) != 0) {
124           code.setCloneable(true);
125         }
126 
127         if ((flag & 0x04) != 0) {
128           code.setDeleteable(true);
129         }
130       }
131 
132       currentPos = codePos;
133     }
134 
135     // Append remaining text after last code
136     if (currentPos < textLength) {
137       tf.append(text.substring(currentPos));
138     }
139 
140     return tf;
141   }
142 
143   /**
144    * Converts a TextFragment (Okapi format) to a LayeredText.
145    * 
146    * @param tf the TextFragment to convert
147    * @return LayeredText representation, or null if input is null
148    */
149   public static LayeredTextX toLayeredText(TextFragment tf, LocaleId loc) {
150     if (tf == null) {
151       return null;
152     }
153 
154     LayeredTextX fs = new LayeredTextX();
155 
156     if (loc != null) {
157       fs.setLanguage(loc.toString());
158     }
159 
160     // Set the plain text (without codes)
161     fs.setText(tf.getText());
162 
163     // Set text with codes for reference
164     fs.setTextWithCodes(tf.toText());
165 
166     List<InlineCode> inlineCodes = new ArrayList<>();
167 
168     if (tf.hasCode()) {
169       String codedText = tf.getCodedText();
170 
171       int textPosition = 0; // Position in plain text
172 
173       for (int i = 0; i < codedText.length(); i++) {
174         char ch = codedText.charAt(i);
175 
176         if (TextFragment.isMarker(ch)) {
177           // This is a code marker
178           char indexChar = codedText.charAt(++i);
179           Code code = tf.getCode(indexChar);
180 
181           InlineCode ic = new InlineCode();
182           ic.setPosition(textPosition);
183           ic.setId(code.getId());
184           ic.setType(code.getType());
185           ic.setData(code.getData());
186 
187           // Convert TagType
188           ic.setTagType(code.getTagType());
189 
190           // Set additional properties
191           if (code.getOuterData() != null && !code.getOuterData().isEmpty()) {
192             ic.setOuterData(code.getOuterData());
193           }
194 
195           if (code.getDisplayText() != null) {
196             ic.setDisplayText(code.getDisplayText());
197           }
198 
199           if (code.getOriginalId() != null) {
200             ic.setOriginalId(code.getOriginalId());
201           }
202 
203           // Convert flags
204           int flag = 0;
205           if (code.hasReference())
206             flag |= 0x01;
207 
208           if (code.isCloneable())
209             flag |= 0x02;
210 
211           if (code.isDeleteable())
212             flag |= 0x04;
213 
214           ic.setFlag(flag);
215 
216           inlineCodes.add(ic);
217 
218         } else {
219           // Regular character
220           textPosition++;
221         }
222       }
223     }
224 
225     fs.setCodes(inlineCodes);
226 
227     return fs;
228   }
229 
230   public static LayeredSegment toLayeredSegment(LayeredTextX slt, LayeredTextX tlt) {
231     LayeredSegment lseg = new LayeredSegment();
232 
233     lseg.setSrcLang(slt.getLanguage());
234     lseg.setSrcText(slt.getText());
235 
236     lseg.setTrgLang(tlt.getLanguage());
237     lseg.setTrgText(tlt.getText());
238 
239     // Source codes
240     List<InlineCodeRef> codeRefs = new ArrayList<>();
241 
242     for (InlineCode c : slt.getCodes()) {
243       InlineCodeRef cr = new InlineCodeRef();
244 
245       cr.setId(c.getId());
246       cr.setTagType(c.getTagType());
247       cr.setPosition(c.getPosition());
248 
249       codeRefs.add(cr);
250     }
251 
252     lseg.setSrcCodes(codeRefs);
253 
254     // Target codes
255     codeRefs = new ArrayList<>();
256 
257     for (InlineCode c : tlt.getCodes()) {
258       InlineCodeRef cr = new InlineCodeRef();
259 
260       cr.setId(c.getId());
261       cr.setTagType(c.getTagType());
262       cr.setPosition(c.getPosition());
263 
264       codeRefs.add(cr);
265     }
266 
267     lseg.setTrgCodes(codeRefs);
268 
269     return lseg;
270   }
271 
272   /**
273    * Converts a LayeredSegment to a LayeredTextX by resolving inline code references.
274    * 
275    * <p>
276    * This method takes the target text and code references from a LayeredSegment and
277    * matches them with full InlineCode objects from a source LayeredTextX. The code
278    * references (InlineCodeRef) contain only the id, position, and tagType, while the
279    * full InlineCode objects contain additional metadata like data, outerData, type,
280    * flags, etc.
281    * </p>
282    * 
283    * <p>
284    * InlineCode objects are uniquely identified by the combination of their id and
285    * tagType fields, as ids alone may not be unique (e.g., opening and closing tags
286    * may share the same id but have different tagTypes).
287    * </p>
288    * 
289    * @param lseg the LayeredSegment containing target text and code references to resolve
290    * @param slt  the source LayeredTextX containing the full InlineCode objects to match against
291    * @return a new LayeredTextX with the target language and text from lseg, and codes
292    *         resolved from slt with positions updated from the code references. Returns
293    *         null if either parameter is null.
294    */
295   public static LayeredTextX tltFromLayeredSegment(LayeredSegment lseg, LayeredTextX slt) {
296     if (lseg == null || slt == null) {
297       return null;
298     }
299 
300     LayeredTextX result = new LayeredTextX();
301 
302     // Set language and text from the target side of the segment
303     result.setLanguage(lseg.getTrgLang());
304     result.setText(lseg.getTrgText());
305 
306     int maxPos = lseg.getTrgText().length() == 0 ? 0 : lseg.getTrgText().length() - 1;
307 
308     // Get code references from the segment's target
309     List<InlineCodeRef> codeRefs = lseg.getTrgCodes();
310 
311     // Get full code objects from the source LayeredTextX
312     List<InlineCode> sourceCodes = slt.getCodes();
313 
314     // Create a composite key class for unique identification
315     class CodeKey {
316       final Integer id;
317       final TagType tagType;
318 
319       CodeKey(Integer id, TagType tagType) {
320         this.id = id;
321         this.tagType = tagType;
322       }
323 
324       @Override
325       public boolean equals(Object o) {
326         if (this == o)
327           return true;
328 
329         if (o == null || getClass() != o.getClass())
330           return false;
331 
332         CodeKey codeKey = (CodeKey) o;
333         return Objects.equals(id, codeKey.id) && tagType == codeKey.tagType;
334       }
335 
336       @Override
337       public int hashCode() {
338         return Objects.hash(id, tagType);
339       }
340     }
341 
342     // Create a map for quick lookup of codes by id and tagType
343     Map<CodeKey, InlineCode> codeMap = new HashMap<>();
344     if (sourceCodes != null) {
345       for (InlineCode code : sourceCodes) {
346         codeMap.put(new CodeKey(code.getId(), code.getTagType()), code);
347       }
348     }
349 
350     // Build the result codes list by matching references to full codes
351     List<InlineCode> resultCodes = new ArrayList<>();
352 
353     if (codeRefs != null) {
354       for (InlineCodeRef ref : codeRefs) {
355         InlineCode fullCode = codeMap.get(new CodeKey(ref.getId(), ref.getTagType()));
356 
357         if (fullCode != null) {
358           // Create a new InlineCode with updated position from the reference
359           InlineCode newCode = new InlineCode();
360 
361           newCode.setId(fullCode.getId());
362 
363           if (ref.getPosition() > maxPos) {
364             Log.warn(ConversionUtil.class,
365                 "Code position from LLM exceeds the length of '{}', normalizing",
366                 lseg.getTrgText());
367 
368             newCode.setPosition(maxPos);
369 
370           } else {
371             newCode.setPosition(ref.getPosition());
372           }
373 
374           newCode.setType(fullCode.getType());
375           newCode.setTagType(ref.getTagType());
376           newCode.setData(fullCode.getData());
377           newCode.setOuterData(fullCode.getOuterData());
378           newCode.setFlag(fullCode.getFlag());
379           newCode.setDisplayText(fullCode.getDisplayText());
380           newCode.setOriginalId(fullCode.getOriginalId());
381 
382           resultCodes.add(newCode);
383         }
384       }
385     }
386 
387     result.setCodes(resultCodes);
388 
389     return result;
390   }
391 
392   public static List<MtTargetInfo> toMtTargets(AltTranslationsAnnotation ata) {
393     List<MtTargetInfo> altTrans = new ArrayList<>();
394 
395     for (AltTranslation at : ata) {
396       if (at instanceof ConfidenceScoredTranslation) {
397         ConfidenceScoredTranslation cst = (ConfidenceScoredTranslation) at;
398 
399         MtTargetInfo mti = new MtTargetInfo();
400 
401         TextContainer target = cst.getTarget();
402         TextFragment ttf = target.getFirstContent();
403         LayeredTextX tlt = ConversionUtil.toLayeredText(ttf, cst.getTargetLocale());
404 
405         mti.setMtTarget(tlt);
406         mti.setMtConfidenceScore(cst.getConfidenceScore());
407         mti.setMtModelId(cst.getOrigin());
408         
409         altTrans.add(mti);
410       }
411     }
412 
413     return altTrans;
414   }
415 }