Source code for src.models.classifier.huggingface

from __future__ import annotations

# typing imports
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from ...config import Config
    from ...data_models import Taxonomy
    from logging import Logger

from ..base import Model

import mlflow
from transformers import pipeline


class HuggingfaceModel(Model):
    def __init__(
        self,
        config: Config,
        logger: Logger,
        model_id: str,
        taxonomy: Taxonomy,
        stage: str = "Production",
    ) -> None:
        super().__init__(config=config, logger=logger, model_id=model_id)
        self._prep_labels(taxonomy)
        self._load_model(model_id=model_id, stage=stage)
    def _load_model(self, model_id: str, stage: str) -> None:
        self.logger.debug(f"model id {model_id}")
        if "mlflow" in self.model_id:
            self.logger.debug("SELECTING MLFLOW MODEL")
            components = mlflow.transformers.load_model(
                f"models:/{model_id.split(':/')[-1]}/{stage}",
                return_type="components",
            )
            tokenizer = components.get("tokenizer")
            # Hard-code the maximum input length to prevent issues with
            # overly long inputs further down the pipeline.
            tokenizer.model_max_length = 512
            self.model = pipeline(
                "text-classification",
                model=components.get("model"),
                tokenizer=tokenizer,
            )
        else:
            self.logger.debug("SELECTING HUGGINGFACE MODEL")
            self.model = pipeline(
                task="text-classification",
                model=model_id,
            )
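    # Sketch of the model-id handling above (names are hypothetical):
    #   model_id = "mlflow:/news-topics"
    #   model_id.split(":/")[-1]  -> "news-topics"
    #   loaded URI                -> "models:/news-topics/Production"
    # Any id without "mlflow" in it is passed straight to the Hugging Face
    # hub, e.g. "distilbert-base-uncased-finetuned-sst-2-english".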
    def classify(self, text: str, multi_label: bool = True, **kwargs) -> dict[str, float]:
        self.logger.debug(f"pipeline: {type(self.model)} {self.model}")
        # return_all_scores=True yields one list of {"label", "score"} dicts
        # per input; [0] selects the scores for the single input text.
        pipeline_output = self.model(
            text,
            return_all_scores=True,
            # padding=True,
            truncation=True,
        )[0]
        reformatted_output = {}
        for record in pipeline_output:
            reformatted_output[record.get("label")] = record.get("score")
        return reformatted_output
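    # Example of the reshaping done by classify (labels are hypothetical):
    #   pipeline output: [{"label": "sports", "score": 0.91},
    #                     {"label": "politics", "score": 0.09}]
    #   return value:    {"sports": 0.91, "politics": 0.09}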
    classify.__doc__ = Model.classify.__doc__
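
# Minimal usage sketch (not part of the original module): assumes a project
# Config, a configured Logger, and a Taxonomy instance are available, and
# that the base Model accepts them as shown in __init__ above. The model id
# and labels below are hypothetical placeholders.
#
#     import logging
#
#     model = HuggingfaceModel(
#         config=config,                       # project Config instance
#         logger=logging.getLogger(__name__),  # standard library logger
#         model_id="mlflow:/news-topics",      # hypothetical registry id
#         taxonomy=taxonomy,                   # project Taxonomy instance
#         stage="Production",
#     )
#     scores = model.classify("The match ended in a draw.")
#     # -> {"sports": 0.91, "politics": 0.09}  (labels depend on the model)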