View Javadoc
1   package com.acumenvelocity.ath.common;
2   
3   import java.io.IOException;
4   import java.io.StringReader;
5   import java.util.ArrayList;
6   import java.util.HashMap;
7   import java.util.HashSet;
8   import java.util.List;
9   import java.util.Map;
10  import java.util.Set;
11  
12  import org.apache.lucene.analysis.Analyzer;
13  import org.apache.lucene.analysis.TokenStream;
14  import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
15  import org.slf4j.Logger;
16  import org.slf4j.LoggerFactory;
17  
18  import com.google.common.io.Files;
19  
20  import net.sf.okapi.common.LocaleId;
21  import net.sf.okapi.common.MimeTypeMapper;
22  import net.sf.okapi.common.annotation.AltTranslation;
23  import net.sf.okapi.common.annotation.AltTranslationsAnnotation;
24  import net.sf.okapi.common.exceptions.OkapiException;
25  import net.sf.okapi.common.resource.Code;
26  import net.sf.okapi.common.resource.ISegments;
27  import net.sf.okapi.common.resource.ITextUnit;
28  import net.sf.okapi.common.resource.Segment;
29  import net.sf.okapi.common.resource.TextContainer;
30  import net.sf.okapi.common.resource.TextFragment;
31  import net.sf.okapi.common.resource.TextUnitUtil;
32  
33  public class OkapiUtil {
34  
35    /**
36     * Calculate Dice's Coefficient for two strings with tokens as ngrams.
37     *
38     * @param st1      first string to compare
39     * @param st2      second string to compare
40     * @param analyzer n-gram analyzer
41     * @return Dice's Coefficient as a float
42     */
43    public static float calculateNgramDiceCoefficient(String st1, String st2,
44        Analyzer analyzer) {
45  
46      Set<String> st1Tokens = new HashSet<>();
47      Set<String> st2Tokens = new HashSet<>();
48  
49      try (TokenStream ts1 = analyzer.tokenStream(null, new StringReader(st1))) {
50        ts1.reset();
51  
52        while (ts1.incrementToken()) {
53          st1Tokens.add(ts1.getAttribute(CharTermAttribute.class).toString());
54        }
55  
56        ts1.end();
57  
58      } catch (IOException e) {
59        throw new OkapiException("Error tokenizing source TextUnits", e);
60      }
61  
62      try (TokenStream ts2 = analyzer.tokenStream(null, new StringReader(st2))) {
63        ts2.reset();
64  
65        while (ts2.incrementToken()) {
66          st2Tokens.add(ts2.getAttribute(CharTermAttribute.class).toString());
67        }
68  
69        ts2.end();
70  
71      } catch (IOException e) {
72        throw new OkapiException("Error tokenizing source TextUnits", e);
73      }
74  
75      // Calculate the dice coefficient for the fuzzy score
76      int st1Size = st1Tokens.size();
77      int st2Size = st2Tokens.size();
78      st1Tokens.retainAll(st2Tokens);
79      int intersection = st1Tokens.size();
80      return ((2.0f * intersection)) / (st1Size + st2Size) * 100.0f;
81    }
82  
83    public static TextContainer safeGetTarget(ITextUnit tu, LocaleId trgLoc) {
84      try {
85        return (trgLoc != null) ? tu.getTarget(trgLoc) : null;
86  
87      } catch (Exception e) {
88        return null;
89      }
90    }
91  
92    /**
93     * Removes all inline tags in the source (or optionally the target) text unit resource.
94     * 
95     * @param textUnit          the given text unit
96     * @param removeTargetCodes - remove target codes?
97     */
98    public static void removeCodes(ITextUnit textUnit, boolean removeTargetCodes) {
99      Logger localLogger = LoggerFactory.getLogger(TextUnitUtil.class);
100     if (textUnit == null) {
101       localLogger.warn("Text unit is null.");
102       return;
103     }
104 
105     // remove source inline codes
106     TextContainer stc = textUnit.getSource();
107     removeCodes(stc);
108 
109     // if requested and if targets exist remove inline codes for all targets
110     if (removeTargetCodes && !textUnit.getTargetLocales().isEmpty()) {
111       for (LocaleId locale : textUnit.getTargetLocales()) {
112         TextContainer ttc = textUnit.getTarget(locale);
113         removeCodes(ttc);
114       }
115     }
116   }
117 
118   public static void removeCodes(TextContainer tc) {
119     ISegments segs = tc.getSegments();
120 
121     for (Segment seg : segs) {
122       TextUnitUtil.removeCodes(seg.text);
123     }
124   }
125 
126   /**
127    * Removes codes from the target TextFragment that don't exist in the source codes list.
128    * A code is considered to exist if there's a matching code in srcCodes with the same
129    * id and tagType pair.
130    * 
131    * @param srcCodes the source list of codes to compare against
132    * @param targetTf the target TextFragment from which to remove extra codes
133    */
134   public static void removeExtraCodes(List<Code> srcCodes, TextFragment targetTf) {
135     if (srcCodes == null || targetTf == null || !targetTf.hasCode()) {
136       return;
137     }
138 
139     // Get a copy of the target codes list to avoid concurrent modification
140     List<Code> trgCodes = new ArrayList<>(targetTf.getCodes());
141 
142     // Iterate through each target code
143     for (Code trgCode : trgCodes) {
144       boolean foundInSource = false;
145 
146       // Search for a matching code in source codes by id + tagType pair
147       for (Code srcCode : srcCodes) {
148         if (srcCode.getId() == trgCode.getId() &&
149             srcCode.getTagType() == trgCode.getTagType()) {
150           foundInSource = true;
151           break;
152         }
153       }
154 
155       // If no matching code found in source, remove it from target
156       if (!foundInSource) {
157         targetTf.removeCode(trgCode);
158       }
159     }
160   }
161 
162   public static void setAlOrigin(ITextUnit tu, LocaleId srcLoc, LocaleId trgLoc) {
163     TextContainer target = tu.getTarget(trgLoc);
164 
165     if (target == null) {
166       return;
167     }
168 
169     ISegments tsegs = target.getSegments();
170 
171     for (Segment tseg : tsegs) {
172       AltTranslationsAnnotation ata = tseg.getAnnotation(AltTranslationsAnnotation.class);
173 
174       if (ata == null) {
175         ata = new AltTranslationsAnnotation();
176       }
177 
178       // Mark for AL origin
179       AltTranslation at = new AltTranslation(srcLoc, trgLoc, tu,
180           Const.AL_MATCH_TYPE, 100, null);
181 
182       ata.add(at);
183       tseg.setAnnotation(ata);
184     }
185   }
186 
187   public static String getMimeType(String fileName) {
188     return MimeTypeMapper.getMimeType(Files.getFileExtension(fileName));
189   }
190 
191   public static void rearrangeCodes(List<Code> codes, TextFragment targetTf) {
192     if (codes == null || targetTf == null || !targetTf.hasCode()) {
193       return;
194     }
195 
196     // DO NOT call balanceMarkers() here - it would destroy malformed text
197     // before we can detect and fix it
198 
199     boolean needsRearrangement = true;
200     int maxIterations = 10; // Prevent infinite loops
201     int iteration = 0;
202 
203     while (needsRearrangement && iteration < maxIterations) {
204       needsRearrangement = false;
205       iteration++;
206 
207       String codedText = targetTf.getCodedText();
208 
209       // Build position map for current iteration
210       Map<Integer, Map<TextFragment.TagType, Integer>> positionMap = buildPositionMap(targetTf,
211           codedText);
212 
213       // Look for swapped pairs
214       List<Code> targetCodes = new ArrayList<>(targetTf.getCodes());
215       for (Code code : targetCodes) {
216         if (code.getTagType() == TextFragment.TagType.OPENING) {
217           int codeId = code.getId();
218 
219           // Get positions from map
220           Map<TextFragment.TagType, Integer> positions = positionMap.get(codeId);
221           if (positions == null)
222             continue;
223 
224           Integer openPos = positions.get(TextFragment.TagType.OPENING);
225           Integer closePos = positions.get(TextFragment.TagType.CLOSING);
226 
227           // Check if both exist and if closing comes before opening (wrong order)
228           if (openPos != null && closePos != null && closePos < openPos) {
229             // Swap the markers in the coded text
230             swapMarkers(targetTf, codedText, openPos, closePos);
231             needsRearrangement = true;
232             break; // Restart loop after making a change
233           }
234         }
235       }
236     }
237 
238     // DO NOT call balanceMarkers() here either - the setCodedText(..., true)
239     // calls within swapMarkers() have already synchronized the internal state
240   }
241 
242   /**
243    * Builds a map of code positions: codeId -> (TagType -> position)
244    */
245   private static Map<Integer, Map<TextFragment.TagType, Integer>> buildPositionMap(
246       TextFragment tf, String codedText) {
247 
248     Map<Integer, Map<TextFragment.TagType, Integer>> positionMap = new HashMap<>();
249 
250     for (int i = 0; i < codedText.length(); i++) {
251       char ch = codedText.charAt(i);
252       if (TextFragment.isMarker(ch)) {
253         int codeIndex = TextFragment.toIndex(codedText.charAt(i + 1));
254         Code code = tf.getCode(codeIndex);
255         int codeId = code.getId();
256 
257         // Get or create inner map for this code ID
258         Map<TextFragment.TagType, Integer> typeMap = positionMap.computeIfAbsent(
259             codeId, k -> new HashMap<>());
260 
261         // Store position for this tag type
262         typeMap.put(code.getTagType(), i);
263 
264         i++; // Skip the index character
265       }
266     }
267 
268     return positionMap;
269   }
270 
271   /**
272    * Swaps the opening and closing markers in the coded text
273    */
274   private static void swapMarkers(TextFragment targetTf, String codedText,
275       int openPos, int closePos) {
276 
277     // Extract the text segments
278     int closeMarkerEnd = closePos + 2;
279     int openMarkerStart = openPos;
280     int openMarkerEnd = openPos + 2;
281 
282     String beforeClose = codedText.substring(0, closePos);
283     String betweenCodes = codedText.substring(closeMarkerEnd, openMarkerStart);
284     String afterOpen = codedText.substring(openMarkerEnd);
285 
286     // Get the marker characters
287     char closeMarkerType = codedText.charAt(closePos);
288     char closeMarkerIndex = codedText.charAt(closePos + 1);
289     char openMarkerType = codedText.charAt(openPos);
290     char openMarkerIndex = codedText.charAt(openPos + 1);
291 
292     // Reconstruct with swapped positions: opening first, then closing
293     StringBuilder newCodedText = new StringBuilder();
294     newCodedText.append(beforeClose);
295     newCodedText.append(openMarkerType).append(openMarkerIndex);
296     newCodedText.append(betweenCodes);
297     newCodedText.append(closeMarkerType).append(closeMarkerIndex);
298     newCodedText.append(afterOpen);
299 
300     // Apply with allowCodeDeletion=true to synchronize internal state immediately
301     // This is CRITICAL - it updates code offsets without destructive balancing
302     targetTf.setCodedText(newCodedText.toString(), targetTf.getCodes(), true);
303   }
304 }