Upload folder using huggingface_hub

- config.json +1 -1
- configuration_phi3.py +42 -29
- model.safetensors +1 -1
- modeling_phi3.py +125 -145
- sample_finetune.py +0 -1
- tokenizer_config.json +1 -1
config.json
CHANGED
@@ -29,7 +29,7 @@
   "rms_norm_eps": 1e-05,
   "rope_scaling": null,
   "rope_theta": 10000.0,
-  "sliding_window":
+  "sliding_window": 2048,
   "tie_word_embeddings": false,
   "torch_dtype": "bfloat16",
   "transformers_version": "4.39.3",
configuration_phi3.py
CHANGED
@@ -83,10 +83,12 @@ class Phi3Config(PretrainedConfig):
         rope_theta (`float`, *optional*, defaults to 10000.0):
             The base period of the RoPE embeddings.
         rope_scaling (`dict`, *optional*):
-            The scaling
-            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be `
+            The scaling strategy for the RoPE embeddings. If `None`, no scaling is applied. If a dictionary, it must
+            contain the following keys: `type`, `short_factor` and `long_factor`. The `type` must be either `su` or `yarn` and
             the `short_factor` and `long_factor` must be lists of numbers with the same length as the hidden size
             divided by the number of attention heads divided by 2.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
         eos_token_id (`int`, *optional*, defaults to 32000):
             The id of the "end-of-sequence" token.
         pad_token_id (`int`, *optional*, defaults to 32000):
@@ -132,6 +134,7 @@ class Phi3Config(PretrainedConfig):
         tie_word_embeddings=False,
         rope_theta=10000.0,
         rope_scaling=None,
+        bos_token_id=1,
         eos_token_id=32000,
         pad_token_id=32000,
         sliding_window=None,
@@ -158,9 +161,11 @@ class Phi3Config(PretrainedConfig):
         self.use_cache = use_cache
         self.rope_theta = rope_theta
         self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
         self.sliding_window = sliding_window

         super().__init__(
+            bos_token_id=bos_token_id,
             eos_token_id=eos_token_id,
             pad_token_id=pad_token_id,
             tie_word_embeddings=tie_word_embeddings,
@@ -168,33 +173,41 @@ class Phi3Config(PretrainedConfig):
         )

     def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
         if self.rope_scaling is None:
             return

-        ...
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 3:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with three fields, `type`, `short_factor` and `long_factor`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_short_factor = self.rope_scaling.get("short_factor", None)
+        rope_scaling_long_factor = self.rope_scaling.get("long_factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["su", "yarn"]:
+            raise ValueError(f"`rope_scaling`'s type field must be one of ['su', 'yarn'], got {rope_scaling_type}")
+        if not (
+            isinstance(rope_scaling_short_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_short_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must be a list of numbers, got {rope_scaling_short_factor}"
+            )
+        if not len(rope_scaling_short_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s short_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_short_factor)}"
+            )
+        if not (
+            isinstance(rope_scaling_long_factor, list)
+            and all(isinstance(x, (int, float)) for x in rope_scaling_long_factor)
+        ):
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must be a list of numbers, got {rope_scaling_long_factor}"
+            )
+        if not len(rope_scaling_long_factor) == self.hidden_size // self.num_attention_heads // 2:
+            raise ValueError(
+                f"`rope_scaling`'s long_factor field must have length {self.hidden_size // self.num_attention_heads // 2}, got {len(rope_scaling_long_factor)}"
+            )
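For reference, a minimal sketch of a `rope_scaling` value that the new `_rope_scaling_validation` above accepts. The hidden size and head count below are illustrative stand-ins, not values read from this repository's config.json.

```python
# Sketch only: shows the shape of a rope_scaling dict that the validator accepts.
hidden_size = 3072          # illustrative
num_attention_heads = 32    # illustrative

# Each factor list must have hidden_size // num_attention_heads // 2 entries.
factor_len = hidden_size // num_attention_heads // 2  # 48 for these example values

rope_scaling = {
    "type": "su",                        # must be "su" or "yarn"
    "short_factor": [1.0] * factor_len,  # list of numbers of length factor_len
    "long_factor": [1.0] * factor_len,
}

# A missing key, an unknown type, or a list of the wrong length makes
# Phi3Config.__init__ raise a ValueError via _rope_scaling_validation().
print(len(rope_scaling["short_factor"]))  # 48
```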
model.safetensors
CHANGED
@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:
+oid sha256:e908f149cf6056b2f8fee5e1443cdae521be06558907eb952fbd5f383ad533b8
 size 2291290600
modeling_phi3.py
CHANGED
@@ -40,6 +40,7 @@ from transformers.utils import (
     add_code_sample_docstrings,
     add_start_docstrings,
     add_start_docstrings_to_model_forward,
+    is_flash_attn_2_available,
     is_flash_attn_greater_or_equal_2_10,
     logging,
     replace_return_docstrings,
@@ -54,26 +55,17 @@ logger = logging.get_logger(__name__)
 _flash_supports_window_size = False
 try:
     from flash_attn import flash_attn_func, flash_attn_varlen_func
+    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa

     _flash_supports_window_size = "window_size" in list(inspect.signature(flash_attn_func).parameters)
-
-    if not _flash_supports_window_size:
-        raise ValueError("Please update flash-attention to support window size.")
-
-    from flash_attn.bert_padding import index_first_axis, pad_input, unpad_input  # noqa
-    from flash_attn.ops.activations import swiglu
-    from flash_attn.ops.rms_norm import RMSNorm as Phi3FlashRMSNorm
-    # else:
 except ImportError as error:
     logger.warning(
-        f"
+        f"`flash-attention` package not found, consider installing for better performance: {error}."
     )
     if not _flash_supports_window_size:
         logger.warning(
-            "
+            "Current `flash-attenton` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`."
         )
-    swiglu = None
-    Phi3FlashRMSNorm = None

 _CHECKPOINT_FOR_DOC = "microsoft/Phi-3-mini-4k-instruct"
 _CONFIG_FOR_DOC = "Phi3Config"
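The import block above only enables sliding-window support when the installed flash-attn build exposes a `window_size` parameter. A standalone sketch of that probe, assuming flash-attn is installed:

```python
# Standalone sketch of the capability check performed at import time above.
# Requires the flash-attn package; otherwise the ImportError branch applies.
import inspect

from flash_attn import flash_attn_func

flash_supports_window_size = "window_size" in inspect.signature(flash_attn_func).parameters
print(f"flash-attn supports window_size: {flash_supports_window_size}")
```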
@@ -103,9 +95,6 @@ class Phi3RMSNorm(nn.Module):
         return self.weight * hidden_states.to(input_dtype)


-PHI3_NORM_CLASS = Phi3RMSNorm if Phi3FlashRMSNorm is None else Phi3FlashRMSNorm
-
-
 # Copied from transformers.models.llama.modeling_llama._get_unpad_data
 def _get_unpad_data(attention_mask):
     seqlens_in_batch = attention_mask.sum(dim=-1, dtype=torch.int32)
@@ -119,7 +108,7 @@ def _get_unpad_data(attention_mask):
     )


-# Copied from transformers.models.
+# Copied from transformers.models.gemma.modeling_gemma.GemmaRotaryEmbedding with gemma->phi3, Gemma->Phi3
 class Phi3RotaryEmbedding(nn.Module):
     def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
         super().__init__()
@@ -127,98 +116,109 @@ class Phi3RotaryEmbedding(nn.Module):
         self.dim = dim
         self.max_position_embeddings = max_position_embeddings
         self.base = base
-        self.register_buffer("inv_freq", inv_freq, persistent=False)
-            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
-        )
-
-    def _set_cos_sin_cache(self, seq_len, device, dtype):
-        self.max_seq_len_cached = seq_len
-        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
-
-        freqs = torch.outer(t, self.inv_freq)
-        # Different from paper, but it uses a different permutation in order to obtain the same calculation
-        emb = torch.cat((freqs, freqs), dim=-1)
-        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
-        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
-
-    def forward(self, x, seq_len=None):
-        if
-            self.
-
-        self,
-        dim,
-        short_factor,
-        long_factor,
-        max_position_embeddings=4096,
-        original_max_position_embeddings=4096,
-        base=10000,
-        magnitude_scaling_policy="su",
-    ):
-        super().__init__()
-
-        self._calc_mscale = lambda scale: float(scale)
-
-    def _calc_mscale_su(self, scale):
-        if scale <= 1.0:
-            return 1.0
-        return math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))
-
-        return 0.1 * math.log(scale) + 1.0
-
-        seq_len = x.shape[-2]
-        t = torch.arange(seq_len, device=x.device, dtype=torch.float32)
-            rescale_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
-            rescale_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)
-        assert rescale_factors.shape == (
-            self.dim // 2,
-        ), f"misaligned shape for LongRoPE rescale factors: {rescale_factors.shape}"
-
-        inv_freq = 1.0 / (
-            rescale_factors * (self.base ** (torch.arange(0, self.dim, 2).float().to(x.device) / self.dim))
-        )
-
-        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("inv_freq", None, persistent=False)

+    @torch.no_grad()
+    def forward(self, x, position_ids, seq_len=None):
         # x: [bs, num_attention_heads, seq_len, head_size]
+        if self.inv_freq is None:
+            self.inv_freq = 1.0 / (
+                self.base ** (torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim)
+            )
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class Phi3SuScaledRotaryEmbedding(Phi3RotaryEmbedding):
+    def __init__(self, dim, config, device=None):
+        super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)
+
+        self.short_factor = config.rope_scaling["short_factor"]
+        self.long_factor = config.rope_scaling["long_factor"]
+        self.original_max_position_embeddings = config.original_max_position_embeddings

+    @torch.no_grad()
+    def forward(self, x, position_ids, seq_len=None):
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.original_max_position_embeddings:
+            ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
+        else:
+            ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)

+        inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
+        self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)

+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()

+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)

+        scale = self.max_position_embeddings / self.original_max_position_embeddings
+        if scale <= 1.0:
+            scaling_factor = 1.0
+        else:
+            scaling_factor = math.sqrt(1 + math.log(scale) / math.log(self.original_max_position_embeddings))

+        cos = emb.cos() * scaling_factor
+        sin = emb.sin() * scaling_factor
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


+class Phi3YarnScaledRotaryEmbedding(Phi3RotaryEmbedding):
+    def __init__(self, dim, config, device=None):
+        super().__init__(dim, config.max_position_embeddings, config.rope_theta, device)

+        self.short_factor = config.rope_scaling["short_factor"]
+        self.long_factor = config.rope_scaling["long_factor"]
+        self.original_max_position_embeddings = config.original_max_position_embeddings

+    @torch.no_grad()
+    def forward(self, x, position_ids, seq_len=None):
+        seq_len = torch.max(position_ids) + 1
         if seq_len > self.original_max_position_embeddings:
+            ext_factors = torch.tensor(self.long_factor, dtype=torch.float32, device=x.device)
         else:
+            ext_factors = torch.tensor(self.short_factor, dtype=torch.float32, device=x.device)

+        inv_freq_shape = torch.arange(0, self.dim, 2, dtype=torch.int64, device=x.device).float() / self.dim
+        self.inv_freq = 1.0 / (ext_factors * self.base**inv_freq_shape)

+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        device_type = x.device.type
+        device_type = device_type if isinstance(device_type, str) and device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+
+        scale = self.max_position_embeddings / self.original_max_position_embeddings
+        if scale <= 1.0:
+            scaling_factor = 1.0
+        else:
+            scaling_factor = 0.1 * math.log(scale) + 1.0
+
+        cos = emb.cos() * scaling_factor
+        sin = emb.sin() * scaling_factor
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)


 # Copied from transformers.models.llama.modeling_llama.rotate_half
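Both scaled rotary classes above apply a single magnitude correction derived from the ratio of the configured context length to the original one. A worked example with assumed lengths (131072 and 4096, chosen for illustration rather than read from this checkpoint):

```python
import math

# Illustrative context lengths; the values in a given config may differ.
max_position_embeddings = 131072
original_max_position_embeddings = 4096

scale = max_position_embeddings / original_max_position_embeddings  # 32.0

# "su" correction, as in Phi3SuScaledRotaryEmbedding.forward above
su_factor = math.sqrt(1 + math.log(scale) / math.log(original_max_position_embeddings))

# "yarn" correction, as in Phi3YarnScaledRotaryEmbedding.forward above
yarn_factor = 0.1 * math.log(scale) + 1.0

print(round(su_factor, 4))    # ~1.1902
print(round(yarn_factor, 4))  # ~1.3466
```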
@@ -229,7 +229,8 @@ def rotate_half(x):
     return torch.cat((-x2, x1), dim=-1)


-def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
     """Applies Rotary Position Embedding to the query and key tensors.

     Args:
@@ -237,9 +238,8 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
         k (`torch.Tensor`): The key tensor.
         cos (`torch.Tensor`): The cosine part of the rotary embedding.
         sin (`torch.Tensor`): The sine part of the rotary embedding.
-        position_ids (`torch.Tensor
-            used to pass offsetted position ids when working with a KV-cache.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
         unsqueeze_dim (`int`, *optional*, defaults to 1):
             The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
             sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
@@ -250,27 +250,14 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
     Returns:
         `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
     """
-    cos = cos
-    sin = sin
-        )
-    k_embed = (k.to(dtype=torch.float32) * cos.to(dtype=torch.float32)) + (
-        rotate_half(k).to(dtype=torch.float32) * sin.to(dtype=torch.float32)
-    )
-    return q_embed.to(q.dtype), k_embed.to(k.dtype)
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed


 class Phi3MLP(nn.Module):
-    """Gated Linear Unit.
-
-    Reference:
-        Language Modeling with Gated Convolutional Networks.
-        https://arxiv.org/pdf/1612.08083v3.pdf.
-    """
-
     def __init__(self, config):
         super().__init__()

@@ -281,17 +268,12 @@ class Phi3MLP(nn.Module):
         self.activation_fn = ACT2FN[config.hidden_act]

     def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
-            gate, y = y.chunk(2, dim=-1)
-            y = swiglu(gate, y)
-        else:
-            gate, y = y.chunk(2, dim=-1)
-            y = y * self.activation_fn(gate)
-        return self.down_proj(
+        up_states = self.gate_up_proj(hidden_states)
+
+        gate, up_states = up_states.chunk(2, dim=-1)
+        up_states = up_states * self.activation_fn(gate)
+
+        return self.down_proj(up_states)


 # Copied from transformers.models.llama.modeling_llama.repeat_kv with llama->phi
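The rewritten `Phi3MLP.forward` above is a plain gated MLP: one fused `gate_up_proj`, a chunk into gate and up halves, the configured activation on the gate, an elementwise product, then `down_proj`. A self-contained sketch with toy dimensions, assuming the configured activation is SiLU:

```python
import torch
import torch.nn as nn
import torch.nn.functional as F

# Toy dimensions, for illustration only.
hidden_size, intermediate_size = 8, 16

gate_up_proj = nn.Linear(hidden_size, 2 * intermediate_size, bias=False)
down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

x = torch.randn(1, 4, hidden_size)            # [batch, seq_len, hidden]
up_states = gate_up_proj(x)                   # [batch, seq_len, 2 * intermediate]
gate, up_states = up_states.chunk(2, dim=-1)  # split the fused projection
up_states = up_states * F.silu(gate)          # gated product (assumes SiLU activation)
out = down_proj(up_states)                    # back to [batch, seq_len, hidden]
print(out.shape)                              # torch.Size([1, 4, 8])
```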
@@ -341,9 +323,10 @@ class Phi3Attention(nn.Module):

         op_size = self.num_heads * self.head_dim + 2 * (self.num_key_value_heads * self.head_dim)
         self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
         self.qkv_proj = nn.Linear(self.hidden_size, op_size, bias=False)
+        self._init_rope()

+    def _init_rope(self):
         if self.rope_scaling is None:
             self.rotary_emb = Phi3RotaryEmbedding(
                 self.head_dim,
@@ -351,17 +334,13 @@ class Phi3Attention(nn.Module):
                 base=self.rope_theta,
             )
         else:
-            self.
-            )
-
-    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
-        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+            scaling_type = self.config.rope_scaling["type"]
+            if scaling_type == "su":
+                self.rotary_emb = Phi3SuScaledRotaryEmbedding(self.head_dim, self.config)
+            elif scaling_type == "yarn":
+                self.rotary_emb = Phi3YarnScaledRotaryEmbedding(self.head_dim, self.config)
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")

     def forward(
         self,
@@ -395,7 +374,8 @@ class Phi3Attention(nn.Module):
                     "with a layer index."
                 )
             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)
+
         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

         if past_key_value is not None:
@@ -515,7 +495,7 @@ class Phi3FlashAttention2(Phi3Attention):

         # Because the input can be padded, the absolute sequence length depends on the max position id.
         rotary_seq_len = max(kv_seq_len, position_ids[:, -1].max().item()) + 1
-        cos, sin = self.rotary_emb(value_states, seq_len=rotary_seq_len)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=rotary_seq_len)

         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

@@ -802,7 +782,7 @@ class Phi3SdpaAttention(Phi3Attention):
         kv_seq_len = key_states.shape[-2]
         if past_key_value is not None:
             kv_seq_len += past_key_value.get_usable_length(kv_seq_len, self.layer_idx)
-        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        cos, sin = self.rotary_emb(value_states, position_ids, seq_len=kv_seq_len)

         query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)

@@ -859,11 +839,11 @@ class Phi3DecoderLayer(nn.Module):
         self.self_attn = PHI3_ATTENTION_CLASSES[config._attn_implementation](config, layer_idx=layer_idx)

         self.mlp = Phi3MLP(config)
-        self.input_layernorm =
+        self.input_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

         self.resid_attn_dropout = nn.Dropout(config.resid_pdrop)
         self.resid_mlp_dropout = nn.Dropout(config.resid_pdrop)
-        self.post_attention_layernorm =
+        self.post_attention_layernorm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

     def forward(
         self,
@@ -1066,9 +1046,8 @@ class Phi3Model(Phi3PreTrainedModel):
         self.layers = nn.ModuleList(
             [Phi3DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
         )
-        self.norm = PHI3_NORM_CLASS(config.hidden_size, eps=config.rms_norm_eps)
-
         self._attn_implementation = config._attn_implementation
+        self.norm = Phi3RMSNorm(config.hidden_size, eps=config.rms_norm_eps)

         self.gradient_checkpointing = False
         # Initialize weights and apply final processing
@@ -1255,6 +1234,7 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
     def get_decoder(self):
         return self.model

+    # Ignore copy
     @add_start_docstrings_to_model_forward(PHI3_INPUTS_DOCSTRING)
     @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
     def forward(
@@ -1284,8 +1264,8 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
        ```python
        >>> from transformers import AutoTokenizer, Phi3ForCausalLM

-        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3")
-        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3")
+        >>> model = Phi3ForCausalLM.from_pretrained("microsoft/phi-3-mini-4k-instruct")
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-3-mini-4k-instruct")

        >>> prompt = "This is an example script ."
        >>> inputs = tokenizer(prompt, return_tensors="pt")
@@ -1293,7 +1273,7 @@ class Phi3ForCausalLM(Phi3PreTrainedModel):
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
-        'This is an example script .\n
+        'This is an example script .\n Certainly! Below is a sample script that demonstrates a simple task, such as calculating the sum'
        ```"""

         output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
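Because the updated modeling and configuration files ship inside the repository, one way to exercise them is through the Auto classes with `trust_remote_code`. A sketch; the prompt and generation settings mirror the docstring example above and are otherwise illustrative:

```python
# Sketch of loading the checkpoint so the bundled modeling_phi3.py /
# configuration_phi3.py are used; settings here are illustrative.
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,  # matches the torch_dtype in config.json
    trust_remote_code=True,      # pick up the repository's custom code
)

inputs = tokenizer("This is an example script .", return_tensors="pt")
output_ids = model.generate(**inputs, max_length=30)
print(tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0])
```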
sample_finetune.py
CHANGED
@@ -25,7 +25,6 @@ check accelerate config:
 args = {
     "bf16": True,
     "do_eval": False,
-    "eval_strategy": "no",
     "learning_rate": 5.0e-06,
     "log_level": "info",
     "logging_steps": 20,
tokenizer_config.json
CHANGED
@@ -335,7 +335,7 @@
     "<|/inst|>"
   ],
   "bos_token": "<s>",
-  "chat_template": "{{ bos_token }}{% for message in messages %}{
+  "chat_template": "{{ bos_token }}{% for message in messages %}{% if (message['role'] == 'system') %}{{'<|system|>' + '\n' + message['content'] + '<|end|>' + '\n'}}{% elif (message['role'] == 'user') %}{{'<|user|>' + '\n' + message['content'] + '<|end|>' + '\n' + '<|assistant|>' + '\n'}}{% elif message['role'] == 'assistant' %}{{message['content'] + '<|end|>' + '\n'}}{% endif %}{% endfor %}",
   "clean_up_tokenization_spaces": false,
   "eos_token": "<|endoftext|>",
   "legacy": false,
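The `chat_template` above wraps system, user, and assistant turns in `<|system|>`, `<|user|>`, `<|assistant|>`, and `<|end|>` markers, and appends `<|assistant|>` after each user turn. A sketch of rendering it through the tokenizer; the messages are illustrative:

```python
# Sketch of rendering the chat template above; the messages are illustrative.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct")

messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": "What is 2 + 2?"},
]

prompt = tokenizer.apply_chat_template(messages, tokenize=False)
print(prompt)
# Expected rendering per the template above:
# <s><|system|>
# You are a helpful assistant.<|end|>
# <|user|>
# What is 2 + 2?<|end|>
# <|assistant|>
```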