
    LiR                        d dl mZ d dlZd dlmZ d dlmZmZ d dlm	Z	 d dl
mZ d dlmZ d dlmZ d d	lmZ d d
lmZ d dlmZmZ  ej2                  e      Z G d de      Z G d de      Ze G d de             Zy)    )annotationsN)Callable)	dataclassfield)Union)parse)TrainingArguments)__version__)ParallelMode)ExplicitEnum)DefaultBatchSamplerMultiDatasetDefaultBatchSamplerc                       e Zd ZdZdZdZdZdZy)BatchSamplersa  
    Stores the acceptable string identifiers for batch samplers.

    The batch sampler is responsible for determining how samples are grouped into batches during training.
    Valid options are:

    - ``BatchSamplers.BATCH_SAMPLER``: **[default]** Uses :class:`~sentence_transformers.sampler.DefaultBatchSampler`, the default
      PyTorch batch sampler.
    - ``BatchSamplers.NO_DUPLICATES``: Uses :class:`~sentence_transformers.sampler.NoDuplicatesBatchSampler`,
      ensuring no duplicate samples in a batch.
    - ``BatchSamplers.NO_DUPLICATES_HASHED``: Uses :class:`~sentence_transformers.sampler.NoDuplicatesBatchSampler`
      with ``precompute_hashes=True``, a variant that precomputes hashes for faster duplicate checks at a small memory cost.
      Requires the ``xxhash`` library to be installed.

      Both are recommended for losses that use in-batch negatives, such as:

        - :class:`~sentence_transformers.losses.MultipleNegativesRankingLoss`
        - :class:`~sentence_transformers.losses.CachedMultipleNegativesRankingLoss`
        - :class:`~sentence_transformers.losses.MultipleNegativesSymmetricRankingLoss`
        - :class:`~sentence_transformers.losses.CachedMultipleNegativesSymmetricRankingLoss`
        - :class:`~sentence_transformers.losses.MegaBatchMarginLoss`
        - :class:`~sentence_transformers.losses.GISTEmbedLoss`
        - :class:`~sentence_transformers.losses.CachedGISTEmbedLoss`
    - ``BatchSamplers.GROUP_BY_LABEL``: Uses :class:`~sentence_transformers.sampler.GroupByLabelBatchSampler`,
      which constructs each batch by drawing at least 2 samples from each of at least 2 distinct labels.
      This guarantees every batch contains multiple classes, which is required for in-batch triplet mining.
      Recommended for:

        - :class:`~sentence_transformers.losses.BatchAllTripletLoss`
        - :class:`~sentence_transformers.losses.BatchHardSoftMarginTripletLoss`
        - :class:`~sentence_transformers.losses.BatchHardTripletLoss`
        - :class:`~sentence_transformers.losses.BatchSemiHardTripletLoss`

    If you want to use a custom batch sampler, then you can subclass
    :class:`~sentence_transformers.sampler.DefaultBatchSampler` and pass the class (not an instance) to the
    ``batch_sampler`` argument in :class:`~sentence_transformers.training_args.SentenceTransformerTrainingArguments`
    (or :class:`~sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments`, etc.).
    Alternatively, you can pass a function that accepts ``dataset``, ``batch_size``, ``drop_last``,
    ``valid_label_columns``, ``generator``, and ``seed`` and returns a
    :class:`~sentence_transformers.sampler.DefaultBatchSampler` instance.

    Usage:
        ::

            from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
            from sentence_transformers.training_args import BatchSamplers
            from sentence_transformers.losses import MultipleNegativesRankingLoss
            from datasets import Dataset

            model = SentenceTransformer("microsoft/mpnet-base")
            train_dataset = Dataset.from_dict({
                "anchor": ["It's nice weather outside today.", "He drove to work."],
                "positive": ["It's so sunny.", "He took the car to the office."],
            })
            loss = MultipleNegativesRankingLoss(model)
            args = SentenceTransformerTrainingArguments(
                output_dir="checkpoints",
                batch_sampler=BatchSamplers.NO_DUPLICATES,
            )
            trainer = SentenceTransformerTrainer(
                model=model,
                args=args,
                train_dataset=train_dataset,
                loss=loss,
            )
            trainer.train()
    batch_samplerno_duplicatesno_duplicates_hashedgroup_by_labelN)__name__
__module____qualname____doc__BATCH_SAMPLERNO_DUPLICATESNO_DUPLICATES_HASHEDGROUP_BY_LABEL     i/var/www/html/lcp-python-backend/venv/lib/python3.12/site-packages/sentence_transformers/training_args.pyr   r      s     BH $M#M1%Nr   r   c                      e Zd ZdZdZdZy)MultiDatasetBatchSamplersa  
    Stores the acceptable string identifiers for multi-dataset batch samplers.

    The multi-dataset batch sampler is responsible for determining in what order batches are sampled from multiple
    datasets during training. Valid options are:

    - ``MultiDatasetBatchSamplers.ROUND_ROBIN``: Uses :class:`~sentence_transformers.sampler.RoundRobinBatchSampler`,
      which uses round-robin sampling from each dataset until one is exhausted.
      With this strategy, it's likely that not all samples from each dataset are used, but each dataset is sampled
      from equally.
    - ``MultiDatasetBatchSamplers.PROPORTIONAL``: **[default]** Uses :class:`~sentence_transformers.sampler.ProportionalBatchSampler`,
      which samples from each dataset in proportion to its size.
      With this strategy, all samples from each dataset are used and larger datasets are sampled from more frequently.

    If you want to use a custom multi-dataset batch sampler, then you can subclass
    :class:`~sentence_transformers.sampler.MultiDatasetDefaultBatchSampler` and pass the class (not an instance) to the
    ``multi_dataset_batch_sampler`` argument in :class:`~sentence_transformers.training_args.SentenceTransformerTrainingArguments`.
    (or :class:`~sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments`, etc.). Alternatively,
    you can pass a function that accepts ``dataset`` (a :class:`~torch.utils.data.ConcatDataset`), ``batch_samplers``
    (i.e. a list of batch sampler for each of the datasets in the :class:`~torch.utils.data.ConcatDataset`), ``generator``,
    and ``seed`` and returns a :class:`~sentence_transformers.sampler.MultiDatasetDefaultBatchSampler` instance.

    Usage:
        ::

            from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
            from sentence_transformers.training_args import MultiDatasetBatchSamplers
            from sentence_transformers.losses import CoSENTLoss
            from datasets import Dataset, DatasetDict

            model = SentenceTransformer("microsoft/mpnet-base")
            train_general = Dataset.from_dict({
                "sentence_A": ["It's nice weather outside today.", "He drove to work."],
                "sentence_B": ["It's so sunny.", "He took the car to the bank."],
                "score": [0.9, 0.4],
            })
            train_medical = Dataset.from_dict({
                "sentence_A": ["The patient has a fever.", "The doctor prescribed medication.", "The patient is sweating."],
                "sentence_B": ["The patient feels hot.", "The medication was given to the patient.", "The patient is perspiring."],
                "score": [0.8, 0.6, 0.7],
            })
            train_legal = Dataset.from_dict({
                "sentence_A": ["This contract is legally binding.", "The parties agree to the terms and conditions."],
                "sentence_B": ["Both parties acknowledge their obligations.", "By signing this agreement, the parties enter into a legal relationship."],
                "score": [0.7, 0.8],
            })
            train_dataset = DatasetDict({
                "general": train_general,
                "medical": train_medical,
                "legal": train_legal,
            })

            loss = CoSENTLoss(model)
            args = SentenceTransformerTrainingArguments(
                output_dir="checkpoints",
                multi_dataset_batch_sampler=MultiDatasetBatchSamplers.PROPORTIONAL,
            )
            trainer = SentenceTransformerTrainer(
                model=model,
                args=args,
                train_dataset=train_dataset,
                loss=loss,
            )
            trainer.train()
    round_robinproportionalN)r   r   r   r   ROUND_ROBINPROPORTIONALr   r   r   r!   r!   ^   s    @D  K!Lr   r!   c                  ,    e Zd ZU dZg dZ edddi      Zded<    eej                  dd	i      Z
d
ed<    eej                  ddi      Zded<    eeddi      Zded<    eeddi      Zded<    ed ddi      Zded<    fdZ fdZ xZS )$SentenceTransformerTrainingArgumentsa  
    SentenceTransformerTrainingArguments extends :class:`~transformers.TrainingArguments` with additional arguments
    specific to Sentence Transformers. See :class:`~transformers.TrainingArguments` for the complete list of
    available arguments.

    Args:
        output_dir (`str`):
            The output directory where the model checkpoints will be written.
        prompts (`Union[Dict[str, Dict[str, str]], Dict[str, str], str]`, *optional*):
            The prompts to use for each column in the training, evaluation and test datasets. Four formats are accepted:

            1. `str`: A single prompt to use for all columns in the datasets, regardless of whether the training/evaluation/test
               datasets are :class:`datasets.Dataset` or a :class:`datasets.DatasetDict`.
            2. `Dict[str, str]`: A dictionary mapping column names to prompts, regardless of whether the training/evaluation/test
               datasets are :class:`datasets.Dataset` or a :class:`datasets.DatasetDict`.
            3. `Dict[str, str]`: A dictionary mapping dataset names to prompts. This should only be used if your training/evaluation/test
               datasets are a :class:`datasets.DatasetDict` or a dictionary of :class:`datasets.Dataset`.
            4. `Dict[str, Dict[str, str]]`: A dictionary mapping dataset names to dictionaries mapping column names to
               prompts. This should only be used if your training/evaluation/test datasets are a
               :class:`datasets.DatasetDict` or a dictionary of :class:`datasets.Dataset`.

        batch_sampler (Union[:class:`~sentence_transformers.training_args.BatchSamplers`, `str`, :class:`~sentence_transformers.sampler.DefaultBatchSampler`, Callable[[...], :class:`~sentence_transformers.sampler.DefaultBatchSampler`]], *optional*):
            The batch sampler to use. See :class:`~sentence_transformers.training_args.BatchSamplers` for valid options.
            Defaults to ``BatchSamplers.BATCH_SAMPLER``.
        multi_dataset_batch_sampler (Union[:class:`~sentence_transformers.training_args.MultiDatasetBatchSamplers`, `str`, :class:`~sentence_transformers.sampler.MultiDatasetDefaultBatchSampler`, Callable[[...], :class:`~sentence_transformers.sampler.MultiDatasetDefaultBatchSampler`]], *optional*):
            The multi-dataset batch sampler to use. See :class:`~sentence_transformers.training_args.MultiDatasetBatchSamplers`
            for valid options. Defaults to ``MultiDatasetBatchSamplers.PROPORTIONAL``.
        router_mapping (`Dict[str, str] | Dict[str, Dict[str, str]]`, *optional*):
            A mapping of dataset column names to Router routes, like "query" or "document". This is used to specify
            which Router submodule to use for each dataset. Two formats are accepted:

            1. `Dict[str, str]`: A mapping of column names to routes.
            2. `Dict[str, Dict[str, str]]`: A mapping of dataset names to a mapping of column names to routes for
               multi-dataset training/evaluation.
        learning_rate_mapping (`Dict[str, float] | None`, *optional*):
            A mapping of parameter name regular expressions to learning rates. This allows you to set different
            learning rates for different parts of the model, e.g., `{'SparseStaticEmbedding\.*': 1e-3}` for the
            SparseStaticEmbedding module. This is useful when you want to fine-tune specific parts of the model
            with different learning rates.
    )accelerator_configfsdp_config	deepspeedgradient_checkpointing_kwargslr_scheduler_kwargspromptsrouter_mappinglearning_rate_mappingNhelpzThe prompts to use for each column in the datasets. Either 1) a single string prompt, 2) a mapping of column names to prompts, 3) a mapping of dataset names to prompts, or 4) a mapping of dataset names to a mapping of column names to prompts.)defaultmetadataz;Union[str, None, dict[str, str], dict[str, dict[str, str]]]r-   zThe batch sampler to use.zRUnion[BatchSamplers, str, DefaultBatchSampler, Callable[..., DefaultBatchSampler]]r   z'The multi-dataset batch sampler to use.zvUnion[MultiDatasetBatchSamplers, str, MultiDatasetDefaultBatchSampler, Callable[..., MultiDatasetDefaultBatchSampler]]multi_dataset_batch_samplerzA mapping of dataset column names to Router routes, like "query" or "document". Either 1) a mapping of column names to routes or 2) a mapping of dataset names to a mapping of column names to routes for multi-dataset training/evaluation. )default_factoryr2   r.   zA mapping of parameter name regular expressions to learning rates. This allows you to set different learning rates for different parts of the model, e.g., {'SparseStaticEmbedding\.*': 1e-3} for the SparseStaticEmbedding module.z"Union[str, None, dict[str, float]]r/   c                 @    t        t              t        d      k\  rd S dS )N5.0.0        )parse_versiontransformers_versionr   r   r   <lambda>z-SentenceTransformerTrainingArguments.<lambda>  s    6J(K}]dOe(e kn r   zThis argument is deprecated and will be removed in the future. If you're on Transformers v5+, then you should use `warmup_steps` instead as it also works with float values.zfloat | Nonewarmup_ratioc                   t        t              t        d      k\  rI| j                  | j                  dk(  r| j                  | _        d | _        t        j                  d       nYt        | j                  t              r?d| j                  cxk  rdk  r*n n'| j                  dk(  r| j                  | _        d| _        t        | %          t        | j                  t              rt        | j                        n| j                  | _
        t        | j                  t              rt        | j                        n| j                  | _        | j                  | j                  ni | _        t        | j                  t              rt!        d      | j"                  | j"                  ni | _        t        | j"                  t              rt!        d      d| _        d	| _        | j(                  t*        j,                  k(  r&| j.                  d
k7  rt        j                  d       y y | j(                  t*        j0                  k(  r9| j2                  s,| j.                  d
k7  rt        j                  d       d| _        y y y )Nr6   r   a  The `warmup_ratio` argument is deprecated in Transformers v5+, and will also be removed from Sentence Transformers once support for Transformers v4 is dropped. Since you're using Transformers v5+, please use `warmup_steps` (as a float) to specify the warmup ratio instead.r7   g      ?zThe `router_mapping` argument must be a dictionary mapping dataset column names to Router routes, like 'query' or 'document'. A stringified dictionary also works.zThe `learning_rate_mapping` argument must be a dictionary mapping parameter name regular expressions to learning rates. A stringified dictionary also works.TFunusedzCurrently using DataParallel (DP) for multi-gpu training, while DistributedDataParallel (DDP) is recommended for faster training. See https://sbert.net/docs/sentence_transformer/training/distributed.html for more information.zWhen using DistributedDataParallel (DDP), it is recommended to set `dataloader_drop_last=True` to avoid hanging issues with an uneven last batch. Setting `dataloader_drop_last=True`.)r8   r9   r;   warmup_stepsloggerwarning
isinstancefloatsuper__post_init__r   strr   r3   r!   r.   
ValueErrorr/   prediction_loss_onlyddp_broadcast_buffersparallel_moder   NOT_DISTRIBUTED
output_dirDISTRIBUTEDdataloader_drop_last)self	__class__s    r   rD   z2SentenceTransformerTrainingArguments.__post_init__	  s:    -.-2HH   ,1B1Ba1G$($5$5!$(!t $++U3d>O>O8URU8UZ^ZkZkorZr$($5$5!$%! 2<D<N<NPS1TM$,,-Z^ZlZl 	
 $::C@ &d&F&FG11 	( 6:5H5H5Td11Z\d))3/ S 
 DHC]C]CiT%?%?oq"d00#6 J  %)! &+"!=!== (*v + <#;#;;DD]D] (*; )-D% E^;r   c                h    t         |          }t        |d         r|d= t        |d         r|d= |S )Nr   r3   )rC   to_dictcallable)rN   training_args_dictrO   s     r   rQ   z,SentenceTransformerTrainingArguments.to_dict\  sD    "W_.&78"?3&'DEF"#@A!!r   )r   r   r   r   _VALID_DICT_FIELDSr   r-   __annotations__r   r   r   r!   r%   r3   dictr.   r/   r;   rD   rQ   __classcell__)rO   s   @r   r'   r'      s   'Z	 LQ d
LGH  in++v?Z6[iMe 
 	)66&JsAt	   " 
 SX P
SNO  AF X
A=  "'n ]
"L, Q-f" "r   r'   )
__future__r   loggingcollections.abcr   dataclassesr   r   typingr   packaging.versionr   r8   transformersr	   TransformersTrainingArgumentsr
   r9   transformers.training_argsr   transformers.utilsr   sentence_transformers.samplerr   r   	getLoggerr   r?   r   r!   r'   r   r   r   <module>rd      su    "  $ (  4 K < 3 + ^			8	$H&L H&VD" D"N |"+H |" |"r   