// SolrTmFilter

package com.acumenvelocity.ath.solr.tm;

import java.io.IOException;
import java.util.Iterator;
import java.util.NoSuchElementException;
import java.util.UUID;

import org.apache.solr.client.solrj.SolrClient;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.response.QueryResponse;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CursorMarkParams;

import com.acumenvelocity.ath.common.Const;
import com.acumenvelocity.ath.common.Log;

import net.sf.okapi.common.Event;
import net.sf.okapi.common.EventType;
import net.sf.okapi.common.MimeTypeMapper;
import net.sf.okapi.common.exceptions.OkapiException;
import net.sf.okapi.common.filters.AbstractFilter;
import net.sf.okapi.common.resource.Ending;
import net.sf.okapi.common.resource.Property;
import net.sf.okapi.common.resource.RawDocument;
import net.sf.okapi.common.resource.StartDocument;
import net.sf.okapi.common.resource.TextContainer;
import net.sf.okapi.common.resource.TextUnit;

/**
 * Streaming Solr translation memory filter designed for large-scale TM operations.
 * Leverages Solr's deep paging capabilities through cursor marks to efficiently
 * process millions of translation units without exhausting heap memory.
 *
 * This filter transforms Solr query results into Okapi event streams suitable
 * for integration with translation processing pipelines. Documents are retrieved
 * in configurable page sizes and processed incrementally, making it ideal for
 * production environments with substantial translation memory databases.
 */
public class SolrTmFilter extends AbstractFilter {

  /** Number of rows requested from Solr per page; applied to the base query with setRows. */
  private static final int STREAM_PAGE_SIZE = 500;

  /**
   * Field the base query is sorted on (ascending).
   * NOTE(review): Solr cursor paging needs the sort to include the collection's
   * uniqueKey field; presumably that is "id" here - confirm against the schema.
   */
  private static final String CURSOR_SORT_FIELD = "id";

  /**
   * Query template handed to Log.format together with the TM id in the constructor.
   * NOTE(review): presumably "{}" is the placeholder Log.format substitutes - confirm.
   */
  private static final String TM_QUERY_TEMPLATE = "tmId:\"{}\"";

  /** Client used for every Solr request; this filter never closes it. */
  private final SolrClient solrClient;

  /** Name of the Solr collection passed to ping and query calls. */
  private final String tmCollection;

  /** Identifier of the translation memory the base query selects. */
  private final UUID tmId;

  /** Query built once in the constructor; requests are issued on copies obtained via getCopy(). */
  private final SolrQuery baseQuery;

  /** Page iterator created by open(); null before open() succeeds and after close(). */
  private StreamingDocumentIterator documentStream;

  /** True once next() has handed out the START_DOCUMENT event. */
  private boolean startEventEmitted;

  /** True once next() has handed out the END_DOCUMENT event. */
  private boolean endEventEmitted;

  /** True between a successful open() and the following close(). */
  private boolean isOperational;

  /**
   * Creates a filter bound to one translation memory inside a Solr collection.
   * The base query is built here and configured for paging; the constructor
   * itself does not send any request to Solr.
   *
   * @param solrClient   client used for all Solr requests
   * @param tmCollection name of the collection holding the TM segments
   * @param tmId         identifier of the translation memory to stream
   * @throws IllegalArgumentException if any argument is {@code null}
   */
  public SolrTmFilter(SolrClient solrClient, String tmCollection, UUID tmId) {
    final boolean allPresent = solrClient != null && tmCollection != null && tmId != null;

    if (!allPresent) {
      throw new IllegalArgumentException("Solr client, collection, and TM ID are mandatory");
    }

    this.solrClient = solrClient;
    this.tmCollection = tmCollection;
    this.tmId = tmId;
    this.baseQuery = new SolrQuery(Log.format(TM_QUERY_TEMPLATE, tmId));

    // Identity of this filter as exposed through the Okapi filter API.
    setMimeType(MimeTypeMapper.DEFAULT_MIME_TYPE);
    setName("okf_solrtm");
    setDisplayName("Solr TM Filter");

    configureQueryForStreaming();
    initializeState();
  }

  /**
   * Applies the paging settings to the base query: the page size and an
   * ascending sort on the cursor sort field.
   */
  private void configureQueryForStreaming() {
    baseQuery
        .setRows(STREAM_PAGE_SIZE)
        .setSort(CURSOR_SORT_FIELD, SolrQuery.ORDER.asc);
  }

  /**
   * Puts the filter back into its "not opened" state: inactive, no boundary
   * events emitted yet, and no page iterator attached.
   */
  private void initializeState() {
    isOperational = false;
    startEventEmitted = false;
    endEventEmitted = false;
    documentStream = null;
  }

  /**
   * Opens the filter: takes the source and target locales from the raw document,
   * resets the event state, pings the TM collection and, if the ping succeeds,
   * attaches a fresh page iterator and marks the filter active.
   *
   * If the ping fails the filter stays inactive, so {@link #hasNext()} keeps
   * returning {@code false}.
   *
   * @param input raw document supplying the source and target locales
   * @throws IllegalArgumentException if {@code input} is {@code null}
   * @throws OkapiException           if the ping fails, whether through a transport
   *                                  error or an error reported by the Solr server
   */
  @Override
  public void open(RawDocument input) {
    if (input == null) {
      throw new IllegalArgumentException("Raw document is mandatory");
    }

    setSrcLoc(input.getSourceLocale());
    setTrgLoc(input.getTargetLocale());

    initializeState();

    try {
      validateSolrConnectivity();
    } catch (SolrServerException | IOException | SolrException ex) {
      // SolrException is unchecked; SolrJ uses it for errors reported by the server
      // (for example an unknown collection), so it has to be wrapped explicitly.
      throw new OkapiException("Cannot establish connection to TM collection: " + tmCollection, ex);
    }

    this.documentStream = new StreamingDocumentIterator();
    this.isOperational = true;
  }

  /**
   * Sends a ping request for the TM collection.
   *
   * @throws SolrServerException if SolrJ reports a server-side failure
   * @throws IOException         if communication with Solr fails
   */
  private void validateSolrConnectivity() throws SolrServerException, IOException {
    // Only the absence of an exception matters; the ping response is not inspected.
    this.solrClient.ping(this.tmCollection);
  }

  /**
   * Reports whether {@link #next()} can deliver another event.
   *
   * An inactive filter never has events. An active one has events while the
   * start event is pending, while the page iterator still has documents, or
   * while the end event is pending - checked in that order. Asking the page
   * iterator may trigger a request to Solr.
   *
   * @return {@code true} if at least one more event is available
   */
  @Override
  public boolean hasNext() {
    return isOperational
        && (!startEventEmitted
            || (documentStream != null && documentStream.hasNext())
            || !endEventEmitted);
  }

  /**
   * Returns the next event of the stream: first one START_DOCUMENT, then one
   * TEXT_UNIT per Solr document, finally one END_DOCUMENT.
   *
   * @return the next event
   * @throws NoSuchElementException if no further event is available
   */
  @Override
  public Event next() {
    if (!hasNext()) {
      throw new NoSuchElementException("No additional events in stream");
    }

    final Event event;

    if (!startEventEmitted) {
      startEventEmitted = true;
      event = produceStartEvent();
    } else if (documentStream != null && documentStream.hasNext()) {
      event = convertToTextUnitEvent(documentStream.next());
    } else if (!endEventEmitted) {
      endEventEmitted = true;
      event = produceEndEvent();
    } else {
      throw new NoSuchElementException("No additional events in stream");
    }

    return event;
  }

  /**
   * Builds the START_DOCUMENT event that opens the stream.
   *
   * Besides the source locale, the start document carries the filter's MIME
   * type, is flagged as multilingual (each text unit holds a source and a
   * target), and gets a default encoding and line break so that consumers
   * reading these values do not receive {@code null}.
   *
   * @return the START_DOCUMENT event
   */
  private Event produceStartEvent() {
    StartDocument sd = new StartDocument("tm-stream");
    sd.setLocale(getSrcLoc());
    sd.setMimeType(getMimeType());
    sd.setMultilingual(true);

    // The content comes from Solr as Java strings, so there is no original file
    // encoding or line break to report; use conventional defaults instead.
    sd.setEncoding("UTF-8", false);
    sd.setLineBreak("\n");

    return new Event(EventType.START_DOCUMENT, sd);
  }

  /**
   * Builds the END_DOCUMENT event that closes the stream.
   *
   * @return the END_DOCUMENT event wrapping an {@link Ending} resource
   */
  private Event produceEndEvent() {
    return new Event(EventType.END_DOCUMENT, new Ending("tm-stream"));
  }

  /**
   * Wraps one Solr document in a TEXT_UNIT event.
   *
   * Source and target are set only when the extracted text is non-null and not
   * empty after trimming; the target is stored under the filter's target locale.
   * Metadata properties are copied afterwards.
   *
   * @param doc Solr document to convert
   * @return TEXT_UNIT event carrying the resulting text unit
   */
  private Event convertToTextUnitEvent(SolrDocument doc) {
    final TextUnit tu = new TextUnit(extractUnitIdentifier(doc));

    final String src = extractSourceText(doc);
    final String trg = extractTargetText(doc);

    final boolean srcUsable = src != null && !src.trim().isEmpty();
    final boolean trgUsable = trg != null && !trg.trim().isEmpty();

    if (srcUsable) {
      tu.setSource(new TextContainer(src));
    }

    if (trgUsable) {
      tu.setTarget(getTrgLoc(), new TextContainer(trg));
    }

    populateMetadataProperties(tu, doc);

    return new Event(EventType.TEXT_UNIT, tu);
  }

  /**
   * Returns the identifier to use for the text unit built from a Solr document.
   *
   * The document's "id" field is used when present. Otherwise a random UUID
   * based identifier is generated; a clock reading is not used because two
   * consecutive readings may be identical, which would produce duplicate ids.
   *
   * @param doc Solr document being converted
   * @return the document id, or a generated "segment-" prefixed identifier
   */
  private String extractUnitIdentifier(SolrDocument doc) {
    Object id = doc.getFieldValue("id");

    if (id != null) {
      return id.toString();
    }

    return "segment-" + UUID.randomUUID();
  }

  /**
   * Resolves the source text of a document by trying a fixed list of field
   * names in order and returning the string form of the first non-null value.
   * Subclasses may override this to use different fields.
   *
   * @param doc Solr document to read
   * @return source text, or {@code null} if none of the fields has a value
   */
  protected String extractSourceText(SolrDocument doc) {
    final String[] candidates = { "source", "src_text", "original", "content_src" };

    for (int i = 0; i < candidates.length; i++) {
      final Object raw = doc.getFieldValue(candidates[i]);

      if (raw == null) {
        continue;
      }

      return raw.toString();
    }

    return null;
  }

  /**
   * Resolves the target text of a document by trying a fixed list of field
   * names in order and returning the string form of the first non-null value.
   * Subclasses may override this to use different fields.
   *
   * @param doc Solr document to read
   * @return target text, or {@code null} if none of the fields has a value
   */
  protected String extractTargetText(SolrDocument doc) {
    final String[] candidates = { "target", "tgt_text", "translation", "content_tgt" };

    for (int i = 0; i < candidates.length; i++) {
      final Object raw = doc.getFieldValue(candidates[i]);

      if (raw == null) {
        continue;
      }

      return raw.toString();
    }

    return null;
  }

  /**
   * Copies a fixed set of metadata fields from the Solr document onto the text
   * unit as properties, in the order listed. Fields without a value are skipped.
   * Subclasses may override this to change which fields are transferred.
   *
   * @param unit text unit receiving the properties
   * @param doc  Solr document providing the values
   */
  protected void populateMetadataProperties(TextUnit unit, SolrDocument doc) {
    final String[] metadataFields = {
        Const.ATH_PROP_USER_ID,
        Const.ATH_PROP_TM_ID,
        Const.ATH_PROP_SRC_LANG,
        Const.ATH_PROP_TRG_LANG,
        Const.ATH_PROP_SOURCE_WITH_CODES,
        Const.ATH_PROP_TARGET_WITH_CODES,
        Const.ATH_PROP_CREATED_AT
    };

    for (String metadataField : metadataFields) {
      copyFieldToProperty(unit, doc, metadataField);
    }
  }

  /**
   * Sets a property on the text unit named after the given field and holding
   * the string form of the field's value. Does nothing when the document has
   * no value for the field.
   *
   * @param unit      text unit receiving the property
   * @param doc       Solr document providing the value
   * @param fieldName Solr field name, also used as the property name
   */
  private void copyFieldToProperty(TextUnit unit, SolrDocument doc, String fieldName) {
    final Object raw = doc.getFieldValue(fieldName);

    if (raw == null) {
      return;
    }

    final Property property = new Property(fieldName, raw.toString());
    unit.setProperty(property);
  }

  /**
   * Closes the filter by dropping the page iterator and marking the filter
   * inactive. The Solr client is left untouched; it is not closed here.
   */
  @Override
  public void close() {
    documentStream = null;
    isOperational = false;
  }

  /**
   * Asks Solr how many documents match the base query without fetching any of
   * them: the query is copied, stripped of any cursor mark and limited to zero
   * rows, and the reported number of hits is returned.
   *
   * @return number of matching documents as reported by Solr
   * @throws OkapiException if the Solr request fails
   */
  public long estimateTotalSegments() throws OkapiException {
    final SolrQuery countOnly = baseQuery.getCopy();
    countOnly.setRows(0);
    countOnly.remove(CursorMarkParams.CURSOR_MARK_PARAM);

    final QueryResponse response;

    try {
      response = solrClient.query(tmCollection, countOnly);
    } catch (SolrServerException | IOException ex) {
      throw new OkapiException("Unable to estimate segment count", ex);
    }

    return response.getResults().getNumFound();
  }

  /**
   * Gets the name of the Solr collection this filter reads from.
   *
   * @return the collection name given at construction time
   */
  public String getTmCollection() {
    return this.tmCollection;
  }

  /**
   * Gets the identifier of the translation memory this filter streams.
   *
   * @return the TM identifier given at construction time
   */
  public UUID getTmId() {
    return this.tmId;
  }

  /**
   * Gets a copy of the base query, so callers cannot modify the query this
   * filter uses internally.
   *
   * @return a fresh copy of the configured query
   */
  public SolrQuery getQuery() {
    final SolrQuery snapshot = baseQuery.getCopy();
    return snapshot;
  }

  /**
   * Tells whether the filter is active, i.e. it has been opened successfully
   * and not closed since.
   *
   * @return {@code true} while the filter is active
   */
  public boolean isActive() {
    return this.isOperational;
  }

  /**
   * Iterator over the documents matching the base query, fetched from Solr one
   * page at a time with cursor marks. A page is requested only when the
   * previous one has been consumed, so at most one page is held at a time.
   *
   * The iterator stops when the enclosing filter is no longer active, when Solr
   * returns an empty page, or when Solr returns a missing or unchanged cursor
   * mark.
   */
  private class StreamingDocumentIterator implements Iterator<SolrDocument> {

    /** Cursor mark to send with the next page request. */
    private String cursorPosition;

    /** Documents of the most recently fetched page; null before the first fetch and after an empty page. */
    private Iterator<SolrDocument> currentPage;

    /** False once Solr has indicated that no further page exists. */
    private boolean moreDataAvailable;

    /**
     * Creates an iterator positioned at the start cursor mark; nothing is
     * fetched until {@link #hasNext()} is called.
     */
    StreamingDocumentIterator() {
      this.cursorPosition = CursorMarkParams.CURSOR_MARK_START;
      this.currentPage = null;
      this.moreDataAvailable = true;
    }

    /**
     * Reports whether another document is available, fetching the next page
     * from Solr when the current one is used up.
     *
     * @return {@code true} if {@link #next()} can return a document
     * @throws OkapiException if fetching the next page fails
     */
    @Override
    public boolean hasNext() {
      if (!isOperational) {
        return false;
      }

      if (currentPage != null && currentPage.hasNext()) {
        return true;
      }

      if (!moreDataAvailable) {
        return false;
      }

      try {
        retrieveNextPage();
        return currentPage != null && currentPage.hasNext();
      } catch (SolrServerException | IOException | RuntimeException ex) {
        // RuntimeException is included on purpose: SolrJ reports server-side
        // errors through unchecked exceptions.
        throw new OkapiException("Failed to retrieve next page from Solr", ex);
      }
    }

    /**
     * Returns the next document of the stream.
     *
     * @return the next Solr document
     * @throws NoSuchElementException if no document is left
     */
    @Override
    public SolrDocument next() {
      if (!hasNext()) {
        throw new NoSuchElementException("Iterator exhausted");
      }

      return currentPage.next();
    }

    /**
     * Requests the page at the current cursor position and updates the
     * iterator state from the response.
     *
     * @throws SolrServerException if the query fails
     * @throws IOException         if communication with Solr fails
     */
    private void retrieveNextPage() throws SolrServerException, IOException {
      SolrQuery pageQuery = baseQuery.getCopy();
      pageQuery.set(CursorMarkParams.CURSOR_MARK_PARAM, cursorPosition);

      QueryResponse response = solrClient.query(tmCollection, pageQuery);
      SolrDocumentList documents = response.getResults();

      if (documents == null || documents.isEmpty()) {
        moreDataAvailable = false;
        currentPage = null;
        return;
      }

      currentPage = documents.iterator();

      String nextCursor = response.getNextCursorMark();

      // An unchanged mark is Solr's end-of-results signal. A missing mark must
      // end the stream as well: storing null would drop the cursor parameter
      // from the next request and fetch the first page all over again.
      moreDataAvailable = nextCursor != null && !nextCursor.equals(cursorPosition);

      if (moreDataAvailable) {
        cursorPosition = nextCursor;
      }
    }
  }
}