Upload folder using huggingface_hub

Browse files

Files changed (6) hide show

config.json +5 -5
example.py +1 -1
hf_nemotron_parse_config.py +7 -7
hf_nemotron_parse_modeling.py +8 -8
hf_nemotron_parse_processor.py +10 -10
preprocessor_config.json +5 -5

config.json CHANGED Viewed

@@ -1,12 +1,12 @@
 {
   "architectures": [
-    "NemotronParseLightForConditionalGeneration"
   ],
   "auto_map": {
-    "AutoConfig": "hf_nemotron_parse_config.NemotronParseLightConfig",
-    "AutoModel": "hf_nemotron_parse_modeling.NemotronParseLightForConditionalGeneration",
-    "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseLightImageProcessor",
-    "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseLightProcessor"
   },
   "bos_token_id": 0,
   "decoder": {

 {
   "architectures": [
+    "NemotronParseTCForConditionalGeneration"
   ],
   "auto_map": {
+    "AutoConfig": "hf_nemotron_parse_config.NemotronParseTCConfig",
+    "AutoModel": "hf_nemotron_parse_modeling.NemotronParseTCForConditionalGeneration",
+    "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseTCImageProcessor",
+    "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseTCProcessor"
   },
   "bos_token_id": 0,
   "decoder": {

example.py CHANGED Viewed

@@ -4,7 +4,7 @@ from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, Au
 from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
 # Load model and processor
-model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-Light"  # Or use a local path
 device = "cuda:0"
 model = AutoModel.from_pretrained(

 from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
 # Load model and processor
+model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-TC"  # Or use a local path
 device = "cuda:0"
 model = AutoModel.from_pretrained(

hf_nemotron_parse_config.py CHANGED Viewed

@@ -5,9 +5,9 @@ from typing import List, Optional
 from transformers.dynamic_module_utils import get_class_from_dynamic_module
-class NemotronParseLightTextConfig(PretrainedConfig):
     """
-    Configuration class for NemotronParseLight text decoder (mBART-based).
     """
     model_type = "nemotron_parse_text"
@@ -69,12 +69,12 @@ class NemotronParseLightTextConfig(PretrainedConfig):
         self.max_sequence_length = max_sequence_length
-class NemotronParseLightConfig(PretrainedConfig):
     """
-    Configuration class for NemotronParseLight model.
-    This configuration class is used to store the configuration of a [`NemotronParseLightForConditionalGeneration`] model.
-    It is used to instantiate an NemotronParseLight model according to the specified arguments, defining the vision and text model configs.
     """
     model_type = "nemotron_parse"
     is_composition = True
@@ -116,7 +116,7 @@ class NemotronParseLightConfig(PretrainedConfig):
             self.encoder = PretrainedConfig()
         decoder["max_sequence_length"] = max_sequence_length
-        self.decoder = NemotronParseLightTextConfig(**decoder)
         self.image_size = image_size
         # Initialize vocab size from text config

 from transformers.dynamic_module_utils import get_class_from_dynamic_module
+class NemotronParseTCTextConfig(PretrainedConfig):
     """
+    Configuration class for NemotronParseTC text decoder (mBART-based).
     """
     model_type = "nemotron_parse_text"
         self.max_sequence_length = max_sequence_length
+class NemotronParseTCConfig(PretrainedConfig):
     """
+    Configuration class for NemotronParseTC model.
+    This configuration class is used to store the configuration of a [`NemotronParseTCForConditionalGeneration`] model.
+    It is used to instantiate a NemotronParseTC model according to the specified arguments, defining the vision and text model configs.
     """
     model_type = "nemotron_parse"
     is_composition = True
             self.encoder = PretrainedConfig()
         decoder["max_sequence_length"] = max_sequence_length
+        self.decoder = NemotronParseTCTextConfig(**decoder)
         self.image_size = image_size
         # Initialize vocab size from text config

hf_nemotron_parse_modeling.py CHANGED Viewed

@@ -13,7 +13,7 @@ from typing import Optional, List, Union, Tuple
 import warnings
 from transformers.modeling_outputs import BaseModelOutput
 from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
-from .hf_nemotron_parse_config import NemotronParseLightConfig
 from transformers import AutoModel
 import time
 from transformers.modeling_attn_mask_utils import (
@@ -56,7 +56,7 @@ def pixel_shuffle(x, scale_factor=0.5, version=2):
     return x
-class NemotronParseLightDecoder(MBartPreTrainedModel):
     """
     Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
@@ -376,11 +376,11 @@ class RadioWithNeck(nn.Module):
         return DonutSwinModelOutput(last_hidden_state=output)
-class NemotronParseLightPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
     """
-    config_class = NemotronParseLightConfig
     base_model_prefix = "vision_encoder_decoder"  # Use VisionEncoderDecoder prefix
     main_input_name = "pixel_values"
     supports_gradient_checkpointing = True
@@ -399,21 +399,21 @@ class NemotronParseLightPreTrainedModel(PreTrainedModel):
                 module.weight.data[module.padding_idx].zero_()
 # Based on transformers.models.encoder_decoder.modeling_encoder_decoder
-class NemotronParseLightForConditionalGeneration(NemotronParseLightPreTrainedModel, GenerationMixin):
     """
-    NemotronParseLight model for conditional generation tasks.
     This model combines a RADIO-based vision encoder with an mBART-based text decoder.
     """
-    def __init__(self, config: NemotronParseLightConfig):
         super().__init__(config)
         self.encoder = RadioWithNeck(config.encoder)
         self.encoder.main_input_name = 'pixel_values'
         self.encoder = self.encoder.to(config.encoder.torch_dtype)
-        self.decoder = NemotronParseLightDecoder(config.decoder)
         self.decoder = self.decoder.to(config.decoder.torch_dtype)
         self.lm_head = nn.Linear(config.decoder.d_model, config.decoder.vocab_size, bias=False, dtype=config.decoder.torch_dtype)

 import warnings
 from transformers.modeling_outputs import BaseModelOutput
 from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
+from .hf_nemotron_parse_config import NemotronParseTCConfig
 from transformers import AutoModel
 import time
 from transformers.modeling_attn_mask_utils import (
     return x
+class NemotronParseTCDecoder(MBartPreTrainedModel):
     """
     Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
         return DonutSwinModelOutput(last_hidden_state=output)
+class NemotronParseTCPreTrainedModel(PreTrainedModel):
     """
     An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
     """
+    config_class = NemotronParseTCConfig
     base_model_prefix = "vision_encoder_decoder"  # Use VisionEncoderDecoder prefix
     main_input_name = "pixel_values"
     supports_gradient_checkpointing = True
                 module.weight.data[module.padding_idx].zero_()
 # Based on transformers.models.encoder_decoder.modeling_encoder_decoder
+class NemotronParseTCForConditionalGeneration(NemotronParseTCPreTrainedModel, GenerationMixin):
     """
+    NemotronParseTC model for conditional generation tasks.
     This model combines a RADIO-based vision encoder with an mBART-based text decoder.
     """
+    def __init__(self, config: NemotronParseTCConfig):
         super().__init__(config)
         self.encoder = RadioWithNeck(config.encoder)
         self.encoder.main_input_name = 'pixel_values'
         self.encoder = self.encoder.to(config.encoder.torch_dtype)
+        self.decoder = NemotronParseTCDecoder(config.decoder)
         self.decoder = self.decoder.to(config.decoder.torch_dtype)
         self.lm_head = nn.Linear(config.decoder.d_model, config.decoder.vocab_size, bias=False, dtype=config.decoder.torch_dtype)

hf_nemotron_parse_processor.py CHANGED Viewed

@@ -13,9 +13,9 @@ from transformers.image_utils import ChannelDimension, ImageInput, PILImageResam
 from transformers.utils import TensorType
-class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin):
     """
-    Image processor for NemotronParseLight model.
     This processor inherits from BaseImageProcessor to be compatible with transformers AutoImageProcessor.
     """
@@ -118,9 +118,9 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
         # Save preprocessor config in standard HuggingFace format
         config = {
-            "feature_extractor_type": "NemotronParseLightImageProcessor",
-            "image_processor_type": "NemotronParseLightImageProcessor",
-            "processor_class": "NemotronParseLightImageProcessor",
             "size": {
                 "height": self.final_size[0],
                 "width": self.final_size[1],
@@ -189,7 +189,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
         **kwargs,
     ) -> Dict[str, torch.Tensor]:
         """
-        Preprocess an image or batch of images for the NemotronParseLight model.
         Args:
             images: Input image(s)
@@ -211,7 +211,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
                 image = np.asarray(image)
             processed_images.append(image)
-        # Apply NemotronParseLight-specific transforms
         pixel_values = []
         for image in processed_images:
             processed_image = self._resize_with_aspect_ratio(image)
@@ -253,7 +253,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
         return self.preprocess(images, **kwargs)
-class NemotronParseLightProcessor(ProcessorMixin):
     attributes = ["image_processor", "tokenizer"]
     image_processor_class = "AutoImageProcessor"
@@ -261,7 +261,7 @@ class NemotronParseLightProcessor(ProcessorMixin):
     def __init__(self, image_processor=None, tokenizer=None, **kwargs):
         if image_processor is None:
-            image_processor = NemotronParseLightImageProcessor(**kwargs)
         super().__init__(image_processor, tokenizer)
@@ -392,5 +392,5 @@ class NemotronParseLightProcessor(ProcessorMixin):
         # Use the parent class's save_pretrained method for processor config
         super().save_pretrained(save_directory, **kwargs)
-        print(f"NemotronParseLightProcessor saved to {save_directory}")
         print(f"AutoTokenizer.from_pretrained('{save_directory}') should now work!")

 from transformers.utils import TensorType
+class NemotronParseTCImageProcessor(BaseImageProcessor, ImageProcessingMixin):
     """
+    Image processor for NemotronParseTC model.
     This processor inherits from BaseImageProcessor to be compatible with transformers AutoImageProcessor.
     """
         # Save preprocessor config in standard HuggingFace format
         config = {
+            "feature_extractor_type": "NemotronParseTCImageProcessor",
+            "image_processor_type": "NemotronParseTCImageProcessor",
+            "processor_class": "NemotronParseTCImageProcessor",
             "size": {
                 "height": self.final_size[0],
                 "width": self.final_size[1],
         **kwargs,
     ) -> Dict[str, torch.Tensor]:
         """
+        Preprocess an image or batch of images for the NemotronParseTC model.
         Args:
             images: Input image(s)
                 image = np.asarray(image)
             processed_images.append(image)
+        # Apply NemotronParseTC-specific transforms
         pixel_values = []
         for image in processed_images:
             processed_image = self._resize_with_aspect_ratio(image)
         return self.preprocess(images, **kwargs)
+class NemotronParseTCProcessor(ProcessorMixin):
     attributes = ["image_processor", "tokenizer"]
     image_processor_class = "AutoImageProcessor"
     def __init__(self, image_processor=None, tokenizer=None, **kwargs):
         if image_processor is None:
+            image_processor = NemotronParseTCImageProcessor(**kwargs)
         super().__init__(image_processor, tokenizer)
         # Use the parent class's save_pretrained method for processor config
         super().save_pretrained(save_directory, **kwargs)
+        print(f"NemotronParseTCProcessor saved to {save_directory}")
         print(f"AutoTokenizer.from_pretrained('{save_directory}') should now work!")

preprocessor_config.json CHANGED Viewed

@@ -1,10 +1,10 @@
 {
-  "feature_extractor_type": "NemotronParseLightImageProcessor",
-  "image_processor_type": "NemotronParseLightImageProcessor",
-  "processor_class": "NemotronParseLightProcessor",
   "auto_map": {
-	  "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseLightImageProcessor",
-	  "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseLightProcessor"
   },
   "do_normalize": false,
   "do_rescale": true,

 {
+  "feature_extractor_type": "NemotronParseTCImageProcessor",
+  "image_processor_type": "NemotronParseTCImageProcessor",
+  "processor_class": "NemotronParseTCProcessor",
   "auto_map": {
+	  "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseTCImageProcessor",
+	  "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseTCProcessor"
   },
   "do_normalize": false,
   "do_rescale": true,