// BaseAlignerStep

package com.acumenvelocity.ath.steps;

import java.util.ArrayList;
import java.util.List;

import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import net.sf.okapi.common.Event;
import net.sf.okapi.common.IParameters;
import net.sf.okapi.common.ISegmenter;
import net.sf.okapi.common.StringUtil;
import net.sf.okapi.common.filters.IFilter;
import net.sf.okapi.common.pipeline.annotations.StepParameterMapping;
import net.sf.okapi.common.pipeline.annotations.StepParameterType;
import net.sf.okapi.common.resource.ITextUnit;
import net.sf.okapi.common.resource.RawDocument;
import net.sf.okapi.common.resource.TextPart;
import net.sf.okapi.lib.segmentation.SRXDocument;

/**
 * Base class for sentence alignment steps that align source and target documents
 * at both paragraph and sentence levels.
 */
/**
 * Base class for sentence alignment steps that align source and target documents
 * at both paragraph and sentence levels.
 *
 * <p>What this class does, as far as its own code shows:
 * <ul>
 * <li>{@link #handleStartBatch(Event)} compiles the source and/or target segmenters from
 * custom or default SRX rules.</li>
 * <li>{@link #handleStartDocument(Event)} resets the collected text units and opens the target
 * filter on the second input document, when one was supplied.</li>
 * <li>{@link #handleTextUnit(Event)} collects every translatable, non-empty source text unit
 * and replaces its event with a no-op event.</li>
 * <li>{@link #processTuEvents(List)} drains the target filter and hands both lists to
 * {@link #performAlignment(List, List)}.</li>
 * </ul>
 *
 * <p>When and how often these handlers are invoked is decided by
 * {@code BaseTuBatchProcessingStep}, which is not visible from this file.
 */
public abstract class BaseAlignerStep extends BaseTuBatchProcessingStep {

  /** Per-instance logger, named after the concrete runtime class via {@code getClass()}. */
  protected final Logger LOGGER = LoggerFactory.getLogger(getClass());

  /** Filter used to read the target (second input) document; may be {@code null}. */
  private final IFilter targetFilter;

  /** Second input document, injected through {@link #setSecondInput(RawDocument)}. */
  private RawDocument targetInput;

  /** Segmenter compiled for the source locale; only assigned when source segmentation is on. */
  private ISegmenter sourceSegmenter;

  /** Segmenter compiled for the target locale; only assigned when target segmentation is on. */
  private ISegmenter targetSegmenter;

  /** Source text units collected for the current document. */
  private final List<ITextUnit> sourceTUs = new ArrayList<>();

  /** Target text units read from the target filter for the current document. */
  private final List<ITextUnit> targetTUs = new ArrayList<>();

  /**
   * Creates the step.
   *
   * @param targetFilter filter used to parse the target document; if {@code null}, no target
   *        text units are ever read
   */
  public BaseAlignerStep(IFilter targetFilter) {
    super();
    this.targetFilter = targetFilter;
  }

  /**
   * Receives the second input document (the target side of the alignment).
   *
   * @param secondInput the raw target document
   */
  @StepParameterMapping(parameterType = StepParameterType.SECOND_INPUT_RAWDOC)
  public void setSecondInput(RawDocument secondInput) {
    this.targetInput = secondInput;
  }

  /**
   * Get the parameters object for this aligner.
   * Must return a parameters class that implements the required parameter methods.
   */
  @Override
  public abstract IParameters getParameters();

  /**
   * Check if source segmentation is enabled.
   *
   * @return {@code true} to compile a source segmenter and segment each collected source text unit
   */
  protected abstract boolean isSegmentSource();

  /**
   * Check if target segmentation is enabled.
   *
   * @return {@code true} to compile a target segmenter and segment each collected target text unit
   */
  protected abstract boolean isSegmentTarget();

  /**
   * Check if custom source segmentation rules should be used.
   *
   * @return {@code true} to try {@link #getCustomSourceRulesPath()} before the default rules
   */
  protected abstract boolean isUseCustomSourceRules();

  /**
   * Check if custom target segmentation rules should be used.
   *
   * @return {@code true} to try {@link #getCustomTargetRulesPath()} before the default rules
   */
  protected abstract boolean isUseCustomTargetRules();

  /**
   * Get the path to custom source segmentation rules.
   *
   * @return the location handed to {@code SRXDocument.loadRules}
   */
  protected abstract String getCustomSourceRulesPath();

  /**
   * Get the path to custom target segmentation rules.
   *
   * @return the location handed to {@code SRXDocument.loadRules}
   */
  protected abstract String getCustomTargetRulesPath();

  /**
   * Check if whitespace should be collapsed.
   *
   * @return {@code true} to collapse whitespace in each text unit before segmentation
   */
  protected abstract boolean isCollapseWhitespace();

  /**
   * Perform the actual alignment logic.
   * Called after all source and target TUs have been collected.
   *
   * @param sourceTUs the collected source text units (the step's live internal list)
   * @param targetTUs the collected target text units (the step's live internal list)
   */
  protected abstract void performAlignment(List<ITextUnit> sourceTUs, List<ITextUnit> targetTUs);

  /**
   * Compiles the source and target segmenters that are enabled.
   *
   * <p>Each side loads its rules into its own {@link SRXDocument}, so the result does not depend
   * on whether a later {@code loadRules} call fully replaces what an earlier one loaded.
   *
   * @param event the start-batch event
   * @return the same event, unchanged
   */
  @Override
  protected Event handleStartBatch(Event event) {
    if (isSegmentSource()) {
      boolean useCustom = isUseCustomSourceRules();
      String customPath = useCustom ? getCustomSourceRulesPath() : null;

      sourceSegmenter = loadSegmentationRules("source", useCustom, customPath)
          .compileLanguageRules(getSourceLocale(), null);
    }

    if (isSegmentTarget()) {
      boolean useCustom = isUseCustomTargetRules();
      String customPath = useCustom ? getCustomTargetRulesPath() : null;

      targetSegmenter = loadSegmentationRules("target", useCustom, customPath)
          .compileLanguageRules(getTargetLocale(), null);
    }

    return event;
  }

  /**
   * Resets the collected text units and, when a second input was supplied, opens the target
   * filter on it before delegating to the superclass.
   *
   * @param event the start-document event
   * @return whatever the superclass handler returns
   */
  @Override
  protected Event handleStartDocument(Event event) {
    sourceTUs.clear();
    targetTUs.clear();

    if (targetInput != null) {
      initializeTargetFilter();
    }

    return super.handleStartDocument(event);
  }

  /**
   * Collects a source text unit for later alignment.
   *
   * <p>Non-translatable or empty text units are not collected. In every case a no-op event is
   * returned in place of the incoming one.
   *
   * @param event the text-unit event
   * @return a no-op event
   */
  @Override
  protected Event handleTextUnit(Event event) {
    ITextUnit tu = event.getTextUnit();

    if (tu.isTranslatable() && !tu.isEmpty()) {
      prepareTextUnit(tu, isSegmentSource(), sourceSegmenter);
      sourceTUs.add(tu);
    }

    return Event.createNoopEvent();
  }

  /**
   * Initialize the target filter with the target input document.
   * Does nothing when either the filter or the target input is missing.
   */
  protected void initializeTargetFilter() {
    if (targetFilter != null && targetInput != null) {
      targetFilter.open(targetInput);
    }
  }

  /**
   * Read all target text units from the target document.
   * This is called automatically by processTuEvents.
   *
   * <p>Only translatable, non-empty text units are kept. The target text is taken from the
   * <em>source</em> container of each unit, because the target document is parsed on its own
   * by the target filter.
   */
  protected void readAllTargetTUs() {
    // Mirror the guard in initializeTargetFilter(): without a target input this class never
    // opened the filter, so there is nothing to iterate.
    if (targetFilter == null || targetInput == null) {
      return;
    }

    while (targetFilter.hasNext()) {
      Event event = targetFilter.next();

      if (!event.isTextUnit()) {
        continue;
      }

      ITextUnit tu = event.getTextUnit();

      if (tu.isTranslatable() && !tu.isEmpty()) {
        prepareTextUnit(tu, isSegmentTarget(), targetSegmenter);
        targetTUs.add(tu);
      }
    }
  }

  /**
   * Drops all collected text units and closes the target filter, if there is one.
   */
  @Override
  protected void clear() {
    sourceTUs.clear();
    targetTUs.clear();

    if (targetFilter != null) {
      targetFilter.close();
    }
  }

  /**
   * Reads the target side and runs the subclass alignment over everything collected so far.
   *
   * @param tuEvents the text-unit events supplied by the superclass; not used here, because
   *        {@link #handleTextUnit(Event)} already collected the source text units
   */
  @Override
  protected void processTuEvents(List<Event> tuEvents) {
    // Read all target TUs
    readAllTargetTUs();

    // Perform the alignment (implemented by subclasses)
    performAlignment(sourceTUs, targetTUs);
  }

  /**
   * Loads SRX rules for one side into a fresh document: the custom rules when requested and
   * readable, the default rules otherwise.
   *
   * @param side "source" or "target"; only used in the warning message
   * @param useCustomRules whether the custom rules should be tried first
   * @param customRulesPath location of the custom rules; ignored unless {@code useCustomRules}
   * @return a document holding the loaded rules
   */
  private SRXDocument loadSegmentationRules(String side, boolean useCustomRules,
      String customRulesPath) {

    if (useCustomRules) {
      SRXDocument customRules = new SRXDocument();

      try {
        customRules.loadRules(customRulesPath);
        return customRules;

      } catch (Exception e) {
        // The trailing throwable is logged as the cause; the rendered message is unchanged.
        LOGGER.warn(
            "Custom {} segmentation rules file '{}' cannot be read. Using the default rules instead.",
            side, customRulesPath, e);
      }
    }

    // Start from a clean document so a partially loaded custom file cannot leak into the defaults.
    SRXDocument defaultRules = new SRXDocument();
    defaultRules.loadRules(SRXDocument.DEFAULT_SRX_RULES);
    return defaultRules;
  }

  /**
   * Applies the optional whitespace collapsing and the optional segmentation to a text unit.
   *
   * @param tu the text unit to modify in place
   * @param segment whether to segment the source content of {@code tu}
   * @param segmenter the segmenter to use when {@code segment} is {@code true}
   */
  private void prepareTextUnit(ITextUnit tu, boolean segment, ISegmenter segmenter) {
    // Collapse whitespace if needed *before* segmentation
    if (isCollapseWhitespace()) {
      for (TextPart part : tu.getSource().getSegments()) {
        part.text.setCodedText(StringUtil.collapseWhitespace(part.text.getCodedText()));
      }
    }

    if (segment) {
      tu.createSourceSegmentation(segmenter);
    }
  }
}