katerynaCh committed on
Commit
39d3da1
·
verified ·
1 Parent(s): 57b650d

Upload folder using huggingface_hub

Browse files
config.json CHANGED
@@ -1,12 +1,12 @@
1
  {
2
  "architectures": [
3
- "NemotronParseLightForConditionalGeneration"
4
  ],
5
  "auto_map": {
6
- "AutoConfig": "hf_nemotron_parse_config.NemotronParseLightConfig",
7
- "AutoModel": "hf_nemotron_parse_modeling.NemotronParseLightForConditionalGeneration",
8
- "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseLightImageProcessor",
9
- "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseLightProcessor"
10
  },
11
  "bos_token_id": 0,
12
  "decoder": {
 
1
  {
2
  "architectures": [
3
+ "NemotronParseTCForConditionalGeneration"
4
  ],
5
  "auto_map": {
6
+ "AutoConfig": "hf_nemotron_parse_config.NemotronParseTCConfig",
7
+ "AutoModel": "hf_nemotron_parse_modeling.NemotronParseTCForConditionalGeneration",
8
+ "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseTCImageProcessor",
9
+ "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseTCProcessor"
10
  },
11
  "bos_token_id": 0,
12
  "decoder": {
example.py CHANGED
@@ -4,7 +4,7 @@ from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, Au
4
  from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
5
 
6
  # Load model and processor
7
- model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-Light" # Or use a local path
8
  device = "cuda:0"
9
 
10
  model = AutoModel.from_pretrained(
 
4
  from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
5
 
6
  # Load model and processor
7
+ model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-TC" # Or use a local path
8
  device = "cuda:0"
9
 
10
  model = AutoModel.from_pretrained(
hf_nemotron_parse_config.py CHANGED
@@ -5,9 +5,9 @@ from typing import List, Optional
5
 
6
  from transformers.dynamic_module_utils import get_class_from_dynamic_module
7
 
8
- class NemotronParseLightTextConfig(PretrainedConfig):
9
  """
10
- Configuration class for NemotronParseLight text decoder (mBART-based).
11
  """
12
  model_type = "nemotron_parse_text"
13
 
@@ -69,12 +69,12 @@ class NemotronParseLightTextConfig(PretrainedConfig):
69
  self.max_sequence_length = max_sequence_length
70
 
71
 
72
- class NemotronParseLightConfig(PretrainedConfig):
73
  """
74
- Configuration class for NemotronParseLight model.
75
 
76
- This configuration class is used to store the configuration of a [`NemotronParseLightForConditionalGeneration`] model.
77
- It is used to instantiate an NemotronParseLight model according to the specified arguments, defining the vision and text model configs.
78
  """
79
  model_type = "nemotron_parse"
80
  is_composition = True
@@ -116,7 +116,7 @@ class NemotronParseLightConfig(PretrainedConfig):
116
  self.encoder = PretrainedConfig()
117
 
118
  decoder["max_sequence_length"] = max_sequence_length
119
- self.decoder = NemotronParseLightTextConfig(**decoder)
120
  self.image_size = image_size
121
 
122
  # Initialize vocab size from text config
 
5
 
6
  from transformers.dynamic_module_utils import get_class_from_dynamic_module
7
 
8
+ class NemotronParseTCTextConfig(PretrainedConfig):
9
  """
10
+ Configuration class for NemotronParseTC text decoder (mBART-based).
11
  """
12
  model_type = "nemotron_parse_text"
13
 
 
69
  self.max_sequence_length = max_sequence_length
70
 
71
 
72
+ class NemotronParseTCConfig(PretrainedConfig):
73
  """
74
+ Configuration class for NemotronParseTC model.
75
 
76
+ This configuration class is used to store the configuration of a [`NemotronParseTCForConditionalGeneration`] model.
77
+ It is used to instantiate a NemotronParseTC model according to the specified arguments, defining the vision and text model configs.
78
  """
79
  model_type = "nemotron_parse"
80
  is_composition = True
 
116
  self.encoder = PretrainedConfig()
117
 
118
  decoder["max_sequence_length"] = max_sequence_length
119
+ self.decoder = NemotronParseTCTextConfig(**decoder)
120
  self.image_size = image_size
121
 
122
  # Initialize vocab size from text config
hf_nemotron_parse_modeling.py CHANGED
@@ -13,7 +13,7 @@ from typing import Optional, List, Union, Tuple
13
  import warnings
14
  from transformers.modeling_outputs import BaseModelOutput
15
  from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
16
- from .hf_nemotron_parse_config import NemotronParseLightConfig
17
  from transformers import AutoModel
18
  import time
19
  from transformers.modeling_attn_mask_utils import (
@@ -56,7 +56,7 @@ def pixel_shuffle(x, scale_factor=0.5, version=2):
56
 
57
  return x
58
 
59
- class NemotronParseLightDecoder(MBartPreTrainedModel):
60
  """
61
  Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
62
 
@@ -376,11 +376,11 @@ class RadioWithNeck(nn.Module):
376
  return DonutSwinModelOutput(last_hidden_state=output)
377
 
378
 
379
- class NemotronParseLightPreTrainedModel(PreTrainedModel):
380
  """
381
  An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
382
  """
383
- config_class = NemotronParseLightConfig
384
  base_model_prefix = "vision_encoder_decoder" # Use VisionEncoderDecoder prefix
385
  main_input_name = "pixel_values"
386
  supports_gradient_checkpointing = True
@@ -399,21 +399,21 @@ class NemotronParseLightPreTrainedModel(PreTrainedModel):
399
  module.weight.data[module.padding_idx].zero_()
400
 
401
  # Based on transformers.models.encoder_decoder.modeling_encoder_decoder
402
- class NemotronParseLightForConditionalGeneration(NemotronParseLightPreTrainedModel, GenerationMixin):
403
  """
404
- NemotronParseLight model for conditional generation tasks.
405
 
406
  This model combines a RADIO-based vision encoder with an mBART-based text decoder.
407
  """
408
 
409
- def __init__(self, config: NemotronParseLightConfig):
410
  super().__init__(config)
411
 
412
  self.encoder = RadioWithNeck(config.encoder)
413
  self.encoder.main_input_name = 'pixel_values'
414
  self.encoder = self.encoder.to(config.encoder.torch_dtype)
415
 
416
- self.decoder = NemotronParseLightDecoder(config.decoder)
417
  self.decoder = self.decoder.to(config.decoder.torch_dtype)
418
 
419
  self.lm_head = nn.Linear(config.decoder.d_model, config.decoder.vocab_size, bias=False, dtype=config.decoder.torch_dtype)
 
13
  import warnings
14
  from transformers.modeling_outputs import BaseModelOutput
15
  from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
16
+ from .hf_nemotron_parse_config import NemotronParseTCConfig
17
  from transformers import AutoModel
18
  import time
19
  from transformers.modeling_attn_mask_utils import (
 
56
 
57
  return x
58
 
59
+ class NemotronParseTCDecoder(MBartPreTrainedModel):
60
  """
61
  Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
62
 
 
376
  return DonutSwinModelOutput(last_hidden_state=output)
377
 
378
 
379
+ class NemotronParseTCPreTrainedModel(PreTrainedModel):
380
  """
381
  An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
382
  """
383
+ config_class = NemotronParseTCConfig
384
  base_model_prefix = "vision_encoder_decoder" # Use VisionEncoderDecoder prefix
385
  main_input_name = "pixel_values"
386
  supports_gradient_checkpointing = True
 
399
  module.weight.data[module.padding_idx].zero_()
400
 
401
  # Based on transformers.models.encoder_decoder.modeling_encoder_decoder
402
+ class NemotronParseTCForConditionalGeneration(NemotronParseTCPreTrainedModel, GenerationMixin):
403
  """
404
+ NemotronParseTC model for conditional generation tasks.
405
 
406
  This model combines a RADIO-based vision encoder with an mBART-based text decoder.
407
  """
408
 
409
+ def __init__(self, config: NemotronParseTCConfig):
410
  super().__init__(config)
411
 
412
  self.encoder = RadioWithNeck(config.encoder)
413
  self.encoder.main_input_name = 'pixel_values'
414
  self.encoder = self.encoder.to(config.encoder.torch_dtype)
415
 
416
+ self.decoder = NemotronParseTCDecoder(config.decoder)
417
  self.decoder = self.decoder.to(config.decoder.torch_dtype)
418
 
419
  self.lm_head = nn.Linear(config.decoder.d_model, config.decoder.vocab_size, bias=False, dtype=config.decoder.torch_dtype)
hf_nemotron_parse_processor.py CHANGED
@@ -13,9 +13,9 @@ from transformers.image_utils import ChannelDimension, ImageInput, PILImageResam
13
  from transformers.utils import TensorType
14
 
15
 
16
- class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin):
17
  """
18
- Image processor for NemotronParseLight model.
19
 
20
  This processor inherits from BaseImageProcessor to be compatible with transformers AutoImageProcessor.
21
  """
@@ -118,9 +118,9 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
118
 
119
  # Save preprocessor config in standard HuggingFace format
120
  config = {
121
- "feature_extractor_type": "NemotronParseLightImageProcessor",
122
- "image_processor_type": "NemotronParseLightImageProcessor",
123
- "processor_class": "NemotronParseLightImageProcessor",
124
  "size": {
125
  "height": self.final_size[0],
126
  "width": self.final_size[1],
@@ -189,7 +189,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
189
  **kwargs,
190
  ) -> Dict[str, torch.Tensor]:
191
  """
192
- Preprocess an image or batch of images for the NemotronParseLight model.
193
 
194
  Args:
195
  images: Input image(s)
@@ -211,7 +211,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
211
  image = np.asarray(image)
212
  processed_images.append(image)
213
 
214
- # Apply NemotronParseLight-specific transforms
215
  pixel_values = []
216
  for image in processed_images:
217
  processed_image = self._resize_with_aspect_ratio(image)
@@ -253,7 +253,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
253
  return self.preprocess(images, **kwargs)
254
 
255
 
256
- class NemotronParseLightProcessor(ProcessorMixin):
257
 
258
  attributes = ["image_processor", "tokenizer"]
259
  image_processor_class = "AutoImageProcessor"
@@ -261,7 +261,7 @@ class NemotronParseLightProcessor(ProcessorMixin):
261
 
262
  def __init__(self, image_processor=None, tokenizer=None, **kwargs):
263
  if image_processor is None:
264
- image_processor = NemotronParseLightImageProcessor(**kwargs)
265
 
266
  super().__init__(image_processor, tokenizer)
267
 
@@ -392,5 +392,5 @@ class NemotronParseLightProcessor(ProcessorMixin):
392
 
393
  # Use the parent class's save_pretrained method for processor config
394
  super().save_pretrained(save_directory, **kwargs)
395
- print(f"NemotronParseLightProcessor saved to {save_directory}")
396
  print(f"AutoTokenizer.from_pretrained('{save_directory}') should now work!")
 
13
  from transformers.utils import TensorType
14
 
15
 
16
+ class NemotronParseTCImageProcessor(BaseImageProcessor, ImageProcessingMixin):
17
  """
18
+ Image processor for NemotronParseTC model.
19
 
20
  This processor inherits from BaseImageProcessor to be compatible with transformers AutoImageProcessor.
21
  """
 
118
 
119
  # Save preprocessor config in standard HuggingFace format
120
  config = {
121
+ "feature_extractor_type": "NemotronParseTCImageProcessor",
122
+ "image_processor_type": "NemotronParseTCImageProcessor",
123
+ "processor_class": "NemotronParseTCImageProcessor",
124
  "size": {
125
  "height": self.final_size[0],
126
  "width": self.final_size[1],
 
189
  **kwargs,
190
  ) -> Dict[str, torch.Tensor]:
191
  """
192
+ Preprocess an image or batch of images for the NemotronParseTC model.
193
 
194
  Args:
195
  images: Input image(s)
 
211
  image = np.asarray(image)
212
  processed_images.append(image)
213
 
214
+ # Apply NemotronParseTC-specific transforms
215
  pixel_values = []
216
  for image in processed_images:
217
  processed_image = self._resize_with_aspect_ratio(image)
 
253
  return self.preprocess(images, **kwargs)
254
 
255
 
256
+ class NemotronParseTCProcessor(ProcessorMixin):
257
 
258
  attributes = ["image_processor", "tokenizer"]
259
  image_processor_class = "AutoImageProcessor"
 
261
 
262
  def __init__(self, image_processor=None, tokenizer=None, **kwargs):
263
  if image_processor is None:
264
+ image_processor = NemotronParseTCImageProcessor(**kwargs)
265
 
266
  super().__init__(image_processor, tokenizer)
267
 
 
392
 
393
  # Use the parent class's save_pretrained method for processor config
394
  super().save_pretrained(save_directory, **kwargs)
395
+ print(f"NemotronParseTCProcessor saved to {save_directory}")
396
  print(f"AutoTokenizer.from_pretrained('{save_directory}') should now work!")
preprocessor_config.json CHANGED
@@ -1,10 +1,10 @@
1
  {
2
- "feature_extractor_type": "NemotronParseLightImageProcessor",
3
- "image_processor_type": "NemotronParseLightImageProcessor",
4
- "processor_class": "NemotronParseLightProcessor",
5
  "auto_map": {
6
- "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseLightImageProcessor",
7
- "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseLightProcessor"
8
  },
9
  "do_normalize": false,
10
  "do_rescale": true,
 
1
  {
2
+ "feature_extractor_type": "NemotronParseTCImageProcessor",
3
+ "image_processor_type": "NemotronParseTCImageProcessor",
4
+ "processor_class": "NemotronParseTCProcessor",
5
  "auto_map": {
6
+ "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseTCImageProcessor",
7
+ "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseTCProcessor"
8
  },
9
  "do_normalize": false,
10
  "do_rescale": true,