// OkapiUtil

package com.acumenvelocity.ath.common;

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.google.common.io.Files;

import net.sf.okapi.common.LocaleId;
import net.sf.okapi.common.MimeTypeMapper;
import net.sf.okapi.common.annotation.AltTranslation;
import net.sf.okapi.common.annotation.AltTranslationsAnnotation;
import net.sf.okapi.common.exceptions.OkapiException;
import net.sf.okapi.common.resource.Code;
import net.sf.okapi.common.resource.ISegments;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.Segment;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextFragment;
import net.sf.okapi.common.resource.TextUnitUtil;

/**
 * Static helpers for working with Okapi text units, inline codes and n-gram
 * similarity scoring.
 *
 * <p>
 * All methods are stateless; the class is not meant to be instantiated.
 */
public class OkapiUtil {

  private static final Logger LOGGER = LoggerFactory.getLogger(OkapiUtil.class);

  /**
   * Number of chars one inline-code marker occupies in coded text, as this class
   * reads it: the marker char itself followed by one index char.
   */
  private static final int MARKER_LENGTH = 2;

  /**
   * Calculate Dice's Coefficient for two strings with tokens as ngrams.
   *
   * <p>
   * Each string is tokenized with the given analyzer, duplicates are discarded,
   * and the score is {@code 2 * |A ∩ B| / (|A| + |B|)} scaled to a percentage.
   *
   * @param st1      first string to compare
   * @param st2      second string to compare
   * @param analyzer n-gram analyzer
   * @return Dice's Coefficient as a float in the range 0 to 100; {@code 0} when
   *         neither string produces any token (instead of {@code NaN})
   * @throws OkapiException if the analyzer fails with an I/O error
   */
  public static float calculateNgramDiceCoefficient(String st1, String st2,
      Analyzer analyzer) {

    Set<String> st1Tokens = collectTokens(st1, analyzer);
    Set<String> st2Tokens = collectTokens(st2, analyzer);

    int totalSize = st1Tokens.size() + st2Tokens.size();

    if (totalSize == 0) {
      // Nothing to compare: avoid 0/0, which would yield NaN
      return 0.0f;
    }

    // Calculate the dice coefficient for the fuzzy score
    st1Tokens.retainAll(st2Tokens);
    int intersection = st1Tokens.size();
    return (2.0f * intersection) / totalSize * 100.0f;
  }

  /**
   * Runs the analyzer over the text and returns the set of distinct terms produced.
   */
  private static Set<String> collectTokens(String text, Analyzer analyzer) {
    Set<String> tokens = new HashSet<>();

    try (TokenStream ts = analyzer.tokenStream(null, new StringReader(text))) {
      // Obtain the term attribute once; its content is refreshed on every incrementToken()
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();

      while (ts.incrementToken()) {
        tokens.add(term.toString());
      }

      ts.end();

    } catch (IOException e) {
      throw new OkapiException("Error tokenizing source TextUnits", e);
    }

    return tokens;
  }

  /**
   * Returns the target container of a text unit without ever throwing.
   *
   * @param tu     the text unit to read from, may be {@code null}
   * @param trgLoc the target locale, may be {@code null}
   * @return the value of {@code tu.getTarget(trgLoc)}, or {@code null} when either
   *         argument is {@code null} or the lookup throws
   */
  public static TextContainer safeGetTarget(ITextUnit tu, LocaleId trgLoc) {
    if (tu == null || trgLoc == null) {
      return null;
    }

    try {
      return tu.getTarget(trgLoc);

    } catch (Exception ignored) {
      // Deliberately best-effort: callers of this method want null, not a failure
      return null;
    }
  }

  /**
   * Removes all inline tags in the source (or optionally the target) text unit resource.
   *
   * <p>
   * A {@code null} text unit is logged as a warning and otherwise ignored.
   *
   * @param textUnit          the given text unit
   * @param removeTargetCodes - remove target codes?
   */
  public static void removeCodes(ITextUnit textUnit, boolean removeTargetCodes) {
    if (textUnit == null) {
      LOGGER.warn("Text unit is null.");
      return;
    }

    // remove source inline codes
    removeCodes(textUnit.getSource());

    // if requested remove inline codes for all targets (no-op when there are none)
    if (removeTargetCodes) {
      for (LocaleId locale : textUnit.getTargetLocales()) {
        removeCodes(textUnit.getTarget(locale));
      }
    }
  }

  /**
   * Removes the inline codes from every segment of the given container.
   *
   * @param tc the container to strip; a {@code null} container is ignored
   */
  public static void removeCodes(TextContainer tc) {
    if (tc == null) {
      return;
    }

    for (Segment seg : tc.getSegments()) {
      TextUnitUtil.removeCodes(seg.text);
    }
  }

  /**
   * Removes codes from the target TextFragment that don't exist in the source codes list.
   * A code is considered to exist if there's a matching code in srcCodes with the same
   * id and tagType pair.
   *
   * <p>
   * Nothing happens when either argument is {@code null} or the target has no codes.
   *
   * @param srcCodes the source list of codes to compare against
   * @param targetTf the target TextFragment from which to remove extra codes
   */
  public static void removeExtraCodes(List<Code> srcCodes, TextFragment targetTf) {
    if (srcCodes == null || targetTf == null || !targetTf.hasCode()) {
      return;
    }

    // Iterate over a copy: the fragment's own code list changes as codes are removed
    List<Code> trgCodes = new ArrayList<>(targetTf.getCodes());

    for (Code trgCode : trgCodes) {
      if (!hasMatchingCode(srcCodes, trgCode)) {
        targetTf.removeCode(trgCode);
      }
    }
  }

  /**
   * Tells whether the list holds a code with the same id and tag type as the wanted one.
   */
  private static boolean hasMatchingCode(List<Code> candidates, Code wanted) {
    for (Code candidate : candidates) {
      if (candidate.getId() == wanted.getId()
          && candidate.getTagType() == wanted.getTagType()) {
        return true;
      }
    }

    return false;
  }

  /**
   * Adds an alternate translation of type {@code Const.AL_MATCH_TYPE} with a score of 100
   * to every segment of the text unit's target, creating the segment's
   * {@link AltTranslationsAnnotation} when it does not have one yet.
   *
   * <p>
   * Does nothing when the text unit has no target for the given locale.
   *
   * @param tu     the text unit whose target segments are annotated
   * @param srcLoc the source locale recorded in the alternate translation
   * @param trgLoc the target locale to look up and to record
   */
  public static void setAlOrigin(ITextUnit tu, LocaleId srcLoc, LocaleId trgLoc) {
    TextContainer target = tu.getTarget(trgLoc);

    if (target == null) {
      return;
    }

    ISegments tsegs = target.getSegments();

    for (Segment tseg : tsegs) {
      AltTranslationsAnnotation ata = tseg.getAnnotation(AltTranslationsAnnotation.class);

      if (ata == null) {
        ata = new AltTranslationsAnnotation();
      }

      // Mark for AL origin
      AltTranslation at = new AltTranslation(srcLoc, trgLoc, tu,
          Const.AL_MATCH_TYPE, 100, null);

      ata.add(at);
      tseg.setAnnotation(ata);
    }
  }

  /**
   * Looks up the MIME type for a file name based on its extension.
   *
   * @param fileName the file name (or path) whose extension is used for the lookup
   * @return whatever {@code MimeTypeMapper.getMimeType} returns for that extension
   */
  public static String getMimeType(String fileName) {
    String extension = Files.getFileExtension(fileName);
    return MimeTypeMapper.getMimeType(extension);
  }

  /**
   * Repairs opening/closing code pairs of the target fragment whose closing marker
   * appears before its opening marker, by swapping the two markers in the coded text.
   *
   * <p>
   * One pair is swapped per pass and the fragment is then re-scanned. The number of
   * passes is capped by the number of codes in the fragment, so every inverted pair
   * can be fixed while the loop is still guaranteed to end.
   *
   * <p>
   * Nothing happens when either argument is {@code null} or the target has no codes.
   *
   * @param codes    only checked for {@code null}; the pairs are detected from the
   *                 target fragment's own codes
   * @param targetTf the fragment to repair in place
   */
  public static void rearrangeCodes(List<Code> codes, TextFragment targetTf) {
    if (codes == null || targetTf == null || !targetTf.hasCode()) {
      return;
    }

    // DO NOT call balanceMarkers() here - it would destroy malformed text
    // before we can detect and fix it

    int swapsDone = 0;
    boolean keepGoing = true;

    while (keepGoing) {
      String codedText = targetTf.getCodedText();

      // Build position map for the current state of the fragment
      Map<Integer, Map<TextFragment.TagType, Integer>> positionMap = buildPositionMap(targetTf,
          codedText);

      List<Code> targetCodes = new ArrayList<>(targetTf.getCodes());

      // A swap never moves any other marker, so there can be at most one swap per
      // code. Hitting this cap means something unexpected happens: stop rather than spin.
      if (swapsDone >= targetCodes.size()) {
        return;
      }

      int[] invertedPair = findInvertedPair(targetCodes, positionMap);

      if (invertedPair == null) {
        keepGoing = false;
      } else {
        swapMarkers(targetTf, codedText, invertedPair[0], invertedPair[1]);
        swapsDone++;
      }
    }

    // DO NOT call balanceMarkers() here either - the setCodedText(..., true)
    // calls within swapMarkers() have already synchronized the internal state
  }

  /**
   * Finds the first opening code (in list order) whose closing marker is positioned
   * before its opening marker.
   *
   * @return {@code {openPos, closePos}} of that pair, or {@code null} when every pair
   *         is in the right order
   */
  private static int[] findInvertedPair(List<Code> targetCodes,
      Map<Integer, Map<TextFragment.TagType, Integer>> positionMap) {

    for (Code code : targetCodes) {
      if (code.getTagType() != TextFragment.TagType.OPENING) {
        continue;
      }

      Map<TextFragment.TagType, Integer> positions = positionMap.get(code.getId());

      if (positions == null) {
        continue;
      }

      Integer openPos = positions.get(TextFragment.TagType.OPENING);
      Integer closePos = positions.get(TextFragment.TagType.CLOSING);

      // Both must exist, and the closing must come before the opening (wrong order)
      if (openPos != null && closePos != null && closePos < openPos) {
        return new int[] { openPos, closePos };
      }
    }

    return null;
  }

  /**
   * Builds a map of code positions: codeId -> (TagType -> position)
   *
   * <p>
   * The position is the index of the marker char in the coded text. When several
   * markers share the same id and tag type, the last one found wins.
   */
  private static Map<Integer, Map<TextFragment.TagType, Integer>> buildPositionMap(
      TextFragment tf, String codedText) {

    Map<Integer, Map<TextFragment.TagType, Integer>> positionMap = new HashMap<>();

    for (int i = 0; i < codedText.length(); i++) {
      char ch = codedText.charAt(i);
      if (TextFragment.isMarker(ch)) {
        // The char right after a marker encodes the index of its code
        int codeIndex = TextFragment.toIndex(codedText.charAt(i + 1));
        Code code = tf.getCode(codeIndex);
        int codeId = code.getId();

        // Get or create inner map for this code ID
        Map<TextFragment.TagType, Integer> typeMap = positionMap.computeIfAbsent(
            codeId, k -> new HashMap<>());

        // Store position for this tag type
        typeMap.put(code.getTagType(), i);

        i++; // Skip the index character
      }
    }

    return positionMap;
  }

  /**
   * Swaps the opening and closing markers in the coded text
   *
   * <p>
   * Expects the closing marker to sit entirely before the opening marker. The text
   * before, between and after the two markers is left untouched.
   *
   * @param targetTf  the fragment that receives the rebuilt coded text
   * @param codedText the coded text the two positions refer to
   * @param openPos   index of the opening marker char
   * @param closePos  index of the closing marker char
   */
  private static void swapMarkers(TextFragment targetTf, String codedText,
      int openPos, int closePos) {

    // Each marker is a (marker char, index char) pair
    String closeMarker = codedText.substring(closePos, closePos + MARKER_LENGTH);
    String openMarker = codedText.substring(openPos, openPos + MARKER_LENGTH);

    // Reconstruct with swapped positions: opening first, then closing
    StringBuilder newCodedText = new StringBuilder(codedText.length());
    newCodedText.append(codedText, 0, closePos);
    newCodedText.append(openMarker);
    newCodedText.append(codedText, closePos + MARKER_LENGTH, openPos);
    newCodedText.append(closeMarker);
    newCodedText.append(codedText, openPos + MARKER_LENGTH, codedText.length());

    // Apply with allowCodeDeletion=true to synchronize internal state immediately.
    // NOTE(review): the original author marked this flag as critical (it is meant to
    // update the fragment without destructive balancing) - confirm against the
    // TextFragment.setCodedText documentation before changing it.
    targetTf.setCodedText(newCodedText.toString(), targetTf.getCodes(), true);
  }
}