Upload folder using huggingface_hub
Browse files
- config.json +5 -5
- example.py +1 -1
- hf_nemotron_parse_config.py +7 -7
- hf_nemotron_parse_modeling.py +8 -8
- hf_nemotron_parse_processor.py +10 -10
- preprocessor_config.json +5 -5
config.json
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
-
"
|
| 4 |
],
|
| 5 |
"auto_map": {
|
| 6 |
-
"AutoConfig": "hf_nemotron_parse_config.
|
| 7 |
-
"AutoModel": "hf_nemotron_parse_modeling.
|
| 8 |
-
"AutoImageProcessor": "hf_nemotron_parse_processor.
|
| 9 |
-
"AutoProcessor": "hf_nemotron_parse_processor.
|
| 10 |
},
|
| 11 |
"bos_token_id": 0,
|
| 12 |
"decoder": {
|
|
|
|
| 1 |
{
|
| 2 |
"architectures": [
|
| 3 |
+
"NemotronParseTCForConditionalGeneration"
|
| 4 |
],
|
| 5 |
"auto_map": {
|
| 6 |
+
"AutoConfig": "hf_nemotron_parse_config.NemotronParseTCConfig",
|
| 7 |
+
"AutoModel": "hf_nemotron_parse_modeling.NemotronParseTCForConditionalGeneration",
|
| 8 |
+
"AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseTCImageProcessor",
|
| 9 |
+
"AutoProcessor": "hf_nemotron_parse_processor.NemotronParseTCProcessor"
|
| 10 |
},
|
| 11 |
"bos_token_id": 0,
|
| 12 |
"decoder": {
|
example.py
CHANGED
|
@@ -4,7 +4,7 @@ from transformers import AutoModel, AutoProcessor, AutoTokenizer, AutoConfig, Au
|
|
| 4 |
from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
|
| 5 |
|
| 6 |
# Load model and processor
|
| 7 |
-
model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-
|
| 8 |
device = "cuda:0"
|
| 9 |
|
| 10 |
model = AutoModel.from_pretrained(
|
|
|
|
| 4 |
from postprocessing import extract_classes_bboxes, transform_bbox_to_original, postprocess_text
|
| 5 |
|
| 6 |
# Load model and processor
|
| 7 |
+
model_path = "nvidia/NVIDIA-Nemotron-Parse-v1.1-TC" # Or use a local path
|
| 8 |
device = "cuda:0"
|
| 9 |
|
| 10 |
model = AutoModel.from_pretrained(
|
hf_nemotron_parse_config.py
CHANGED
|
@@ -5,9 +5,9 @@ from typing import List, Optional
|
|
| 5 |
|
| 6 |
from transformers.dynamic_module_utils import get_class_from_dynamic_module
|
| 7 |
|
| 8 |
-
class
|
| 9 |
"""
|
| 10 |
-
Configuration class for
|
| 11 |
"""
|
| 12 |
model_type = "nemotron_parse_text"
|
| 13 |
|
|
@@ -69,12 +69,12 @@ class NemotronParseLightTextConfig(PretrainedConfig):
|
|
| 69 |
self.max_sequence_length = max_sequence_length
|
| 70 |
|
| 71 |
|
| 72 |
-
class
|
| 73 |
"""
|
| 74 |
-
Configuration class for
|
| 75 |
|
| 76 |
-
This configuration class is used to store the configuration of a [`
|
| 77 |
-
It is used to instantiate an
|
| 78 |
"""
|
| 79 |
model_type = "nemotron_parse"
|
| 80 |
is_composition = True
|
|
@@ -116,7 +116,7 @@ class NemotronParseLightConfig(PretrainedConfig):
|
|
| 116 |
self.encoder = PretrainedConfig()
|
| 117 |
|
| 118 |
decoder["max_sequence_length"] = max_sequence_length
|
| 119 |
-
self.decoder =
|
| 120 |
self.image_size = image_size
|
| 121 |
|
| 122 |
# Initialize vocab size from text config
|
|
|
|
| 5 |
|
| 6 |
from transformers.dynamic_module_utils import get_class_from_dynamic_module
|
| 7 |
|
| 8 |
+
class NemotronParseTCTextConfig(PretrainedConfig):
|
| 9 |
"""
|
| 10 |
+
Configuration class for NemotronParseTC text decoder (mBART-based).
|
| 11 |
"""
|
| 12 |
model_type = "nemotron_parse_text"
|
| 13 |
|
|
|
|
| 69 |
self.max_sequence_length = max_sequence_length
|
| 70 |
|
| 71 |
|
| 72 |
+
class NemotronParseTCConfig(PretrainedConfig):
|
| 73 |
"""
|
| 74 |
+
Configuration class for NemotronParseTC model.
|
| 75 |
|
| 76 |
+
This configuration class is used to store the configuration of a [`NemotronParseTCForConditionalGeneration`] model.
|
| 77 |
+
It is used to instantiate a NemotronParseTC model according to the specified arguments, defining the vision and text model configs.
|
| 78 |
"""
|
| 79 |
model_type = "nemotron_parse"
|
| 80 |
is_composition = True
|
|
|
|
| 116 |
self.encoder = PretrainedConfig()
|
| 117 |
|
| 118 |
decoder["max_sequence_length"] = max_sequence_length
|
| 119 |
+
self.decoder = NemotronParseTCTextConfig(**decoder)
|
| 120 |
self.image_size = image_size
|
| 121 |
|
| 122 |
# Initialize vocab size from text config
|
hf_nemotron_parse_modeling.py
CHANGED
|
@@ -13,7 +13,7 @@ from typing import Optional, List, Union, Tuple
|
|
| 13 |
import warnings
|
| 14 |
from transformers.modeling_outputs import BaseModelOutput
|
| 15 |
from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
|
| 16 |
-
from .hf_nemotron_parse_config import
|
| 17 |
from transformers import AutoModel
|
| 18 |
import time
|
| 19 |
from transformers.modeling_attn_mask_utils import (
|
|
@@ -56,7 +56,7 @@ def pixel_shuffle(x, scale_factor=0.5, version=2):
|
|
| 56 |
|
| 57 |
return x
|
| 58 |
|
| 59 |
-
class
|
| 60 |
"""
|
| 61 |
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
|
| 62 |
|
|
@@ -376,11 +376,11 @@ class RadioWithNeck(nn.Module):
|
|
| 376 |
return DonutSwinModelOutput(last_hidden_state=output)
|
| 377 |
|
| 378 |
|
| 379 |
-
class
|
| 380 |
"""
|
| 381 |
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
|
| 382 |
"""
|
| 383 |
-
config_class =
|
| 384 |
base_model_prefix = "vision_encoder_decoder" # Use VisionEncoderDecoder prefix
|
| 385 |
main_input_name = "pixel_values"
|
| 386 |
supports_gradient_checkpointing = True
|
|
@@ -399,21 +399,21 @@ class NemotronParseLightPreTrainedModel(PreTrainedModel):
|
|
| 399 |
module.weight.data[module.padding_idx].zero_()
|
| 400 |
|
| 401 |
# Based on transformers.models.encoder_decoder.modeling_encoder_decoder
|
| 402 |
-
class
|
| 403 |
"""
|
| 404 |
-
|
| 405 |
|
| 406 |
This model combines a RADIO-based vision encoder with an mBART-based text decoder.
|
| 407 |
"""
|
| 408 |
|
| 409 |
-
def __init__(self, config:
|
| 410 |
super().__init__(config)
|
| 411 |
|
| 412 |
self.encoder = RadioWithNeck(config.encoder)
|
| 413 |
self.encoder.main_input_name = 'pixel_values'
|
| 414 |
self.encoder = self.encoder.to(config.encoder.torch_dtype)
|
| 415 |
|
| 416 |
-
self.decoder =
|
| 417 |
self.decoder = self.decoder.to(config.decoder.torch_dtype)
|
| 418 |
|
| 419 |
self.lm_head = nn.Linear(config.decoder.d_model, config.decoder.vocab_size, bias=False, dtype=config.decoder.torch_dtype)
|
|
|
|
| 13 |
import warnings
|
| 14 |
from transformers.modeling_outputs import BaseModelOutput
|
| 15 |
from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
|
| 16 |
+
from .hf_nemotron_parse_config import NemotronParseTCConfig
|
| 17 |
from transformers import AutoModel
|
| 18 |
import time
|
| 19 |
from transformers.modeling_attn_mask_utils import (
|
|
|
|
| 56 |
|
| 57 |
return x
|
| 58 |
|
| 59 |
+
class NemotronParseTCDecoder(MBartPreTrainedModel):
|
| 60 |
"""
|
| 61 |
Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
|
| 62 |
|
|
|
|
| 376 |
return DonutSwinModelOutput(last_hidden_state=output)
|
| 377 |
|
| 378 |
|
| 379 |
+
class NemotronParseTCPreTrainedModel(PreTrainedModel):
|
| 380 |
"""
|
| 381 |
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
|
| 382 |
"""
|
| 383 |
+
config_class = NemotronParseTCConfig
|
| 384 |
base_model_prefix = "vision_encoder_decoder" # Use VisionEncoderDecoder prefix
|
| 385 |
main_input_name = "pixel_values"
|
| 386 |
supports_gradient_checkpointing = True
|
|
|
|
| 399 |
module.weight.data[module.padding_idx].zero_()
|
| 400 |
|
| 401 |
# Based on transformers.models.encoder_decoder.modeling_encoder_decoder
|
| 402 |
+
class NemotronParseTCForConditionalGeneration(NemotronParseTCPreTrainedModel, GenerationMixin):
|
| 403 |
"""
|
| 404 |
+
NemotronParseTC model for conditional generation tasks.
|
| 405 |
|
| 406 |
This model combines a RADIO-based vision encoder with an mBART-based text decoder.
|
| 407 |
"""
|
| 408 |
|
| 409 |
+
def __init__(self, config: NemotronParseTCConfig):
|
| 410 |
super().__init__(config)
|
| 411 |
|
| 412 |
self.encoder = RadioWithNeck(config.encoder)
|
| 413 |
self.encoder.main_input_name = 'pixel_values'
|
| 414 |
self.encoder = self.encoder.to(config.encoder.torch_dtype)
|
| 415 |
|
| 416 |
+
self.decoder = NemotronParseTCDecoder(config.decoder)
|
| 417 |
self.decoder = self.decoder.to(config.decoder.torch_dtype)
|
| 418 |
|
| 419 |
self.lm_head = nn.Linear(config.decoder.d_model, config.decoder.vocab_size, bias=False, dtype=config.decoder.torch_dtype)
|
hf_nemotron_parse_processor.py
CHANGED
|
@@ -13,9 +13,9 @@ from transformers.image_utils import ChannelDimension, ImageInput, PILImageResam
|
|
| 13 |
from transformers.utils import TensorType
|
| 14 |
|
| 15 |
|
| 16 |
-
class
|
| 17 |
"""
|
| 18 |
-
Image processor for
|
| 19 |
|
| 20 |
This processor inherits from BaseImageProcessor to be compatible with transformers AutoImageProcessor.
|
| 21 |
"""
|
|
@@ -118,9 +118,9 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
|
|
| 118 |
|
| 119 |
# Save preprocessor config in standard HuggingFace format
|
| 120 |
config = {
|
| 121 |
-
"feature_extractor_type": "
|
| 122 |
-
"image_processor_type": "
|
| 123 |
-
"processor_class": "
|
| 124 |
"size": {
|
| 125 |
"height": self.final_size[0],
|
| 126 |
"width": self.final_size[1],
|
|
@@ -189,7 +189,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
|
|
| 189 |
**kwargs,
|
| 190 |
) -> Dict[str, torch.Tensor]:
|
| 191 |
"""
|
| 192 |
-
Preprocess an image or batch of images for the
|
| 193 |
|
| 194 |
Args:
|
| 195 |
images: Input image(s)
|
|
@@ -211,7 +211,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
|
|
| 211 |
image = np.asarray(image)
|
| 212 |
processed_images.append(image)
|
| 213 |
|
| 214 |
-
# Apply
|
| 215 |
pixel_values = []
|
| 216 |
for image in processed_images:
|
| 217 |
processed_image = self._resize_with_aspect_ratio(image)
|
|
@@ -253,7 +253,7 @@ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin)
|
|
| 253 |
return self.preprocess(images, **kwargs)
|
| 254 |
|
| 255 |
|
| 256 |
-
class
|
| 257 |
|
| 258 |
attributes = ["image_processor", "tokenizer"]
|
| 259 |
image_processor_class = "AutoImageProcessor"
|
|
@@ -261,7 +261,7 @@ class NemotronParseLightProcessor(ProcessorMixin):
|
|
| 261 |
|
| 262 |
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
| 263 |
if image_processor is None:
|
| 264 |
-
image_processor =
|
| 265 |
|
| 266 |
super().__init__(image_processor, tokenizer)
|
| 267 |
|
|
@@ -392,5 +392,5 @@ class NemotronParseLightProcessor(ProcessorMixin):
|
|
| 392 |
|
| 393 |
# Use the parent class's save_pretrained method for processor config
|
| 394 |
super().save_pretrained(save_directory, **kwargs)
|
| 395 |
-
print(f"
|
| 396 |
print(f"AutoTokenizer.from_pretrained('{save_directory}') should now work!")
|
|
|
|
| 13 |
from transformers.utils import TensorType
|
| 14 |
|
| 15 |
|
| 16 |
+
class NemotronParseTCImageProcessor(BaseImageProcessor, ImageProcessingMixin):
|
| 17 |
"""
|
| 18 |
+
Image processor for NemotronParseTC model.
|
| 19 |
|
| 20 |
This processor inherits from BaseImageProcessor to be compatible with transformers AutoImageProcessor.
|
| 21 |
"""
|
|
|
|
| 118 |
|
| 119 |
# Save preprocessor config in standard HuggingFace format
|
| 120 |
config = {
|
| 121 |
+
"feature_extractor_type": "NemotronParseTCImageProcessor",
|
| 122 |
+
"image_processor_type": "NemotronParseTCImageProcessor",
|
| 123 |
+
"processor_class": "NemotronParseTCImageProcessor",
|
| 124 |
"size": {
|
| 125 |
"height": self.final_size[0],
|
| 126 |
"width": self.final_size[1],
|
|
|
|
| 189 |
**kwargs,
|
| 190 |
) -> Dict[str, torch.Tensor]:
|
| 191 |
"""
|
| 192 |
+
Preprocess an image or batch of images for the NemotronParseTC model.
|
| 193 |
|
| 194 |
Args:
|
| 195 |
images: Input image(s)
|
|
|
|
| 211 |
image = np.asarray(image)
|
| 212 |
processed_images.append(image)
|
| 213 |
|
| 214 |
+
# Apply NemotronParseTC-specific transforms
|
| 215 |
pixel_values = []
|
| 216 |
for image in processed_images:
|
| 217 |
processed_image = self._resize_with_aspect_ratio(image)
|
|
|
|
| 253 |
return self.preprocess(images, **kwargs)
|
| 254 |
|
| 255 |
|
| 256 |
+
class NemotronParseTCProcessor(ProcessorMixin):
|
| 257 |
|
| 258 |
attributes = ["image_processor", "tokenizer"]
|
| 259 |
image_processor_class = "AutoImageProcessor"
|
|
|
|
| 261 |
|
| 262 |
def __init__(self, image_processor=None, tokenizer=None, **kwargs):
|
| 263 |
if image_processor is None:
|
| 264 |
+
image_processor = NemotronParseTCImageProcessor(**kwargs)
|
| 265 |
|
| 266 |
super().__init__(image_processor, tokenizer)
|
| 267 |
|
|
|
|
| 392 |
|
| 393 |
# Use the parent class's save_pretrained method for processor config
|
| 394 |
super().save_pretrained(save_directory, **kwargs)
|
| 395 |
+
print(f"NemotronParseTCProcessor saved to {save_directory}")
|
| 396 |
print(f"AutoTokenizer.from_pretrained('{save_directory}') should now work!")
|
preprocessor_config.json
CHANGED
|
@@ -1,10 +1,10 @@
|
|
| 1 |
{
|
| 2 |
-
"feature_extractor_type": "
|
| 3 |
-
"image_processor_type": "
|
| 4 |
-
"processor_class": "
|
| 5 |
"auto_map": {
|
| 6 |
-
"AutoImageProcessor": "hf_nemotron_parse_processor.
|
| 7 |
-
"AutoProcessor": "hf_nemotron_parse_processor.
|
| 8 |
},
|
| 9 |
"do_normalize": false,
|
| 10 |
"do_rescale": true,
|
|
|
|
| 1 |
{
|
| 2 |
+
"feature_extractor_type": "NemotronParseTCImageProcessor",
|
| 3 |
+
"image_processor_type": "NemotronParseTCImageProcessor",
|
| 4 |
+
"processor_class": "NemotronParseTCProcessor",
|
| 5 |
"auto_map": {
|
| 6 |
+
"AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseTCImageProcessor",
|
| 7 |
+
"AutoProcessor": "hf_nemotron_parse_processor.NemotronParseTCProcessor"
|
| 8 |
},
|
| 9 |
"do_normalize": false,
|
| 10 |
"do_rescale": true,
|