katerynaCh committed on
Commit 58ce2eb · verified · 1 Parent(s): ddf5f62

Upload folder using huggingface_hub

config.json ADDED
@@ -0,0 +1,402 @@
1
+ {
2
+ "architectures": [
3
+ "NemotronParseLightForConditionalGeneration"
4
+ ],
5
+ "auto_map": {
6
+ "AutoConfig": "hf_nemotron_parse_config.NemotronParseLightConfig",
7
+ "AutoModel": "hf_nemotron_parse_modeling.NemotronParseLightForConditionalGeneration",
8
+ "AutoImageProcessor": "hf_nemotron_parse_processor.NemotronParseLightImageProcessor",
9
+ "AutoProcessor": "hf_nemotron_parse_processor.NemotronParseLightProcessor"
10
+ },
11
+ "bos_token_id": 0,
12
+ "decoder": {
13
+ "_attn_implementation": "sdpa",
14
+ "_name_or_path": "",
15
+ "activation_dropout": 0.0,
16
+ "activation_function": "gelu",
17
+ "add_cross_attention": true,
18
+ "add_final_layer_norm": true,
19
+ "architectures": null,
20
+ "attention_dropout": 0.0,
21
+ "bad_words_ids": null,
22
+ "begin_suppress_tokens": null,
23
+ "bos_token_id": 0,
24
+ "chunk_size_feed_forward": 0,
25
+ "classifier_dropout": 0.0,
26
+ "cross_attention_hidden_size": null,
27
+ "d_model": 1024,
28
+ "decoder_attention_heads": 16,
29
+ "decoder_ffn_dim": 4096,
30
+ "decoder_layerdrop": 0.0,
31
+ "decoder_layers": 10,
32
+ "decoder_start_token_id": null,
33
+ "diversity_penalty": 0.0,
34
+ "do_sample": false,
35
+ "dropout": 0.1,
36
+ "early_stopping": false,
37
+ "encoder_attention_heads": 16,
38
+ "encoder_ffn_dim": 4096,
39
+ "encoder_layerdrop": 0.0,
40
+ "encoder_layers": 12,
41
+ "encoder_no_repeat_ngram_size": 0,
42
+ "eos_token_id": 2,
43
+ "exponential_decay_length_penalty": null,
44
+ "finetuning_task": null,
45
+ "forced_bos_token_id": null,
46
+ "forced_eos_token_id": 2,
47
+ "hidden_size": 1024,
48
+ "id2label": {
49
+ "0": "LABEL_0",
50
+ "1": "LABEL_1",
51
+ "2": "LABEL_2"
52
+ },
53
+ "init_std": 0.02,
54
+ "is_decoder": true,
55
+ "is_encoder_decoder": false,
56
+ "label2id": {
57
+ "LABEL_0": 0,
58
+ "LABEL_1": 1,
59
+ "LABEL_2": 2
60
+ },
61
+ "length_penalty": 1.0,
62
+ "max_length": 20,
63
+ "min_length": 0,
64
+ "model_type": "nemotron_parse_text",
65
+ "no_repeat_ngram_size": 0,
66
+ "num_beam_groups": 1,
67
+ "num_beams": 1,
68
+ "num_hidden_layers": 12,
69
+ "num_return_sequences": 1,
70
+ "output_attentions": false,
71
+ "output_hidden_states": false,
72
+ "output_scores": false,
73
+ "pad_token_id": 1,
74
+ "prefix": null,
75
+ "problem_type": null,
76
+ "pruned_heads": {},
77
+ "remove_invalid_values": false,
78
+ "repetition_penalty": 1.0,
79
+ "return_dict": true,
80
+ "return_dict_in_generate": false,
81
+ "scale_embedding": true,
82
+ "sep_token_id": null,
83
+ "suppress_tokens": null,
84
+ "task_specific_params": null,
85
+ "temperature": 1.0,
86
+ "tf_legacy_loss": false,
87
+ "tie_encoder_decoder": false,
88
+ "tie_word_embeddings": false,
89
+ "tokenizer_class": null,
90
+ "top_k": 50,
91
+ "top_p": 1.0,
92
+ "torch_dtype": "bfloat16",
93
+ "torchscript": false,
94
+ "transformers_version": "4.51.3",
95
+ "typical_p": 1.0,
96
+ "use_bfloat16": true,
97
+ "use_cache": true,
98
+ "vocab_size": 52352
99
+ },
100
+ "decoder_start_token_id": 2,
101
+ "encoder": {
102
+ "_attn_implementation": "eager",
103
+ "_name_or_path": "nvidia/C-RADIOv2-H",
104
+ "adaptor_configs": {},
105
+ "adaptor_names": null,
106
+ "add_cross_attention": false,
107
+ "architectures": [
108
+ "RADIOModel"
109
+ ],
110
+ "args": {
111
+ "aa": null,
112
+ "amp": true,
113
+ "amp_dtype": "bfloat16",
114
+ "amp_impl": "native",
115
+ "aug_repeats": 0,
116
+ "aug_splits": 0,
117
+ "bn_eps": null,
118
+ "bn_momentum": null,
119
+ "cache_dir": null,
120
+ "channels_last": false,
121
+ "checkpoint_hist": 10,
122
+ "chk_keep_forever": 100,
123
+ "class_map": "",
124
+ "clip_grad": null,
125
+ "clip_mode": "norm",
126
+ "cls_token_per_teacher": true,
127
+ "coco_annotations_file": "/datasets/coco2017-adlsa/annotations/captions_val2017.json",
128
+ "coco_image_dir": "/datasets/coco2017-adlsa/val2017",
129
+ "color_jitter": 0.4,
130
+ "cooldown_epochs": 0,
131
+ "cpe_max_size": 2048,
132
+ "crd_loss": false,
133
+ "crd_loss_weight": 0.8,
134
+ "crop_pct": null,
135
+ "cutmix": 0.0,
136
+ "cutmix_minmax": null,
137
+ "dataset_download": false,
138
+ "debug_full_knn": false,
139
+ "decay_epochs": 90,
140
+ "decay_milestones": [
141
+ 90,
142
+ 180,
143
+ 270
144
+ ],
145
+ "decay_rate": 0.1,
146
+ "depchain": true,
147
+ "dist_bn": "reduce",
148
+ "dist_norm_weight": 0.0,
149
+ "distributed": true,
150
+ "drop": 0.0,
151
+ "drop_block": null,
152
+ "drop_connect": null,
153
+ "drop_path": null,
154
+ "dtype": "bfloat16",
155
+ "epoch_repeats": 0.0,
156
+ "eval": false,
157
+ "eval_metric": "knn_top1",
158
+ "eval_teacher": false,
159
+ "eval_teacher_only": false,
160
+ "eval_throughput": false,
161
+ "fast_norm": false,
162
+ "fd_loss_fn": "MSE",
163
+ "feature_normalization": "SHIP_NORM",
164
+ "feature_summarizer": "cls_token",
165
+ "feature_upscale_factor": null,
166
+ "force_new_wandb_id": false,
167
+ "force_spectral_reparam": true,
168
+ "freeze_bn": false,
169
+ "fsdp": false,
170
+ "fuser": "",
171
+ "gp": null,
172
+ "grad_accum_steps": 1,
173
+ "grad_checkpointing": false,
174
+ "head_init_bias": null,
175
+ "head_init_scale": null,
176
+ "head_warmup": 5,
177
+ "head_weight_decay": 0.001,
178
+ "hflip": 0.5,
179
+ "img_size": null,
180
+ "in_chans": null,
181
+ "initial_checkpoint": null,
182
+ "input_size": null,
183
+ "interpolation": "",
184
+ "layer_decay": null,
185
+ "local_rank": 0,
186
+ "log_interval": 50,
187
+ "log_mlflow": false,
188
+ "log_wandb": true,
189
+ "loss_auto_balance": false,
190
+ "lr_base": 0.1,
191
+ "lr_base_scale": "",
192
+ "lr_base_size": 256,
193
+ "lr_cycle_decay": 0.5,
194
+ "lr_cycle_limit": 1,
195
+ "lr_cycle_mul": 1.0,
196
+ "lr_k_decay": 1.0,
197
+ "lr_noise": null,
198
+ "lr_noise_pct": 0.67,
199
+ "lr_noise_std": 1.0,
200
+ "mean": null,
201
+ "mesa": false,
202
+ "min_lr": 0,
203
+ "mixup": 0.0,
204
+ "mixup_mode": "batch",
205
+ "mixup_off_epoch": 0,
206
+ "mixup_prob": 1.0,
207
+ "mixup_switch_prob": 0.5,
208
+ "mlp_hidden_size": 1520,
209
+ "mlp_num_inner": 3,
210
+ "mlp_version": "v2",
211
+ "model": "vit_huge_patch16_224",
212
+ "model_kwargs": {},
213
+ "model_norm": false,
214
+ "momentum": 0.9,
215
+ "no_aug": false,
216
+ "no_ddp_bb": true,
217
+ "no_prefetcher": false,
218
+ "no_resume_opt": false,
219
+ "num_classes": null,
220
+ "opt_betas": null,
221
+ "opt_eps": null,
222
+ "patience_epochs": 10,
223
+ "pin_mem": false,
224
+ "prefetcher": true,
225
+ "pretrained": false,
226
+ "rank": 0,
227
+ "ratio": [
228
+ 0.75,
229
+ 1.3333333333333333
230
+ ],
231
+ "recount": 1,
232
+ "recovery_interval": 0,
233
+ "register_multiple": 8,
234
+ "remode": "pixel",
235
+ "reprob": 0.0,
236
+ "reset_loss_state": false,
237
+ "resplit": false,
238
+ "save_images": false,
239
+ "scale": [
240
+ 0.5,
241
+ 1.0
242
+ ],
243
+ "sched": "cosine",
244
+ "seed": 42,
245
+ "smoothing": 0.1,
246
+ "spectral_heads": false,
247
+ "spectral_reparam": false,
248
+ "split_bn": false,
249
+ "start_epoch": null,
250
+ "std": null,
251
+ "stream_teachers": true,
252
+ "sync_bn": false,
253
+ "synchronize_step": false,
254
+ "teachers": [
255
+ {
256
+ "fd_normalize": false,
257
+ "feature_distillation": true,
258
+ "input_size": 378,
259
+ "model": "ViT-H-14-378-quickgelu",
260
+ "name": "clip",
261
+ "pretrained": "dfn5b",
262
+ "type": "open_clip",
263
+ "use_summary": true
264
+ },
265
+ {
266
+ "fd_normalize": false,
267
+ "feature_distillation": true,
268
+ "input_size": 378,
269
+ "model": "ViT-SO400M-14-SigLIP-384",
270
+ "name": "siglip",
271
+ "pretrained": "webli",
272
+ "type": "open_clip",
273
+ "use_summary": true
274
+ },
275
+ {
276
+ "fd_normalize": false,
277
+ "feature_distillation": true,
278
+ "input_size": 378,
279
+ "model": "dinov2_vitg14_reg",
280
+ "name": "dino_v2",
281
+ "type": "dino_v2",
282
+ "use_summary": true
283
+ },
284
+ {
285
+ "fd_normalize": false,
286
+ "feature_distillation": true,
287
+ "input_size": 1024,
288
+ "model": "vit-h",
289
+ "name": "sam",
290
+ "type": "sam",
291
+ "use_summary": false
292
+ }
293
+ ],
294
+ "torchcompile": null,
295
+ "torchscript": false,
296
+ "train_interpolation": "random",
297
+ "train_split": "train",
298
+ "tta": 0,
299
+ "use_coco": false,
300
+ "use_multi_epochs_loader": false,
301
+ "val_ema_only": false,
302
+ "val_split": "val",
303
+ "vflip": 0.0,
304
+ "vitdet_version": 1,
305
+ "wandb_entity": "",
306
+ "wandb_job_type": "",
307
+ "wandb_name": "",
308
+ "wandb_project": "",
309
+ "warmup_lr": 1e-05,
310
+ "warmup_prefix": false,
311
+ "worker_seeding": "all",
312
+ "workers": 8,
313
+ "world_size": 256
314
+ },
315
+ "auto_map": {
316
+ "AutoConfig": "nvidia/C-RADIOv2-H--hf_model.RADIOConfig",
317
+ "AutoModel": "nvidia/C-RADIOv2-H--hf_model.RADIOModel"
318
+ },
319
+ "bad_words_ids": null,
320
+ "begin_suppress_tokens": null,
321
+ "bos_token_id": null,
322
+ "chunk_size_feed_forward": 0,
323
+ "cross_attention_hidden_size": null,
324
+ "decoder_start_token_id": null,
325
+ "diversity_penalty": 0.0,
326
+ "do_sample": false,
327
+ "early_stopping": false,
328
+ "encoder_no_repeat_ngram_size": 0,
329
+ "eos_token_id": null,
330
+ "exponential_decay_length_penalty": null,
331
+ "feature_normalizer_config": null,
332
+ "finetuning_task": null,
333
+ "forced_bos_token_id": null,
334
+ "forced_eos_token_id": null,
335
+ "id2label": {
336
+ "0": "LABEL_0",
337
+ "1": "LABEL_1"
338
+ },
339
+ "inter_feature_normalizer_config": null,
340
+ "is_decoder": false,
341
+ "is_encoder_decoder": false,
342
+ "label2id": {
343
+ "LABEL_0": 0,
344
+ "LABEL_1": 1
345
+ },
346
+ "length_penalty": 1.0,
347
+ "max_length": 20,
348
+ "max_resolution": 2048,
349
+ "min_length": 0,
350
+ "model_type": "",
351
+ "no_repeat_ngram_size": 0,
352
+ "num_beam_groups": 1,
353
+ "num_beams": 1,
354
+ "num_return_sequences": 1,
355
+ "output_attentions": false,
356
+ "output_hidden_states": false,
357
+ "output_scores": false,
358
+ "pad_token_id": null,
359
+ "patch_size": 16,
360
+ "preferred_resolution": [
361
+ 768,
362
+ 768
363
+ ],
364
+ "prefix": null,
365
+ "problem_type": null,
366
+ "pruned_heads": {},
367
+ "remove_invalid_values": false,
368
+ "repetition_penalty": 1.0,
369
+ "return_dict": true,
370
+ "return_dict_in_generate": false,
371
+ "sep_token_id": null,
372
+ "suppress_tokens": null,
373
+ "task_specific_params": null,
374
+ "temperature": 1.0,
375
+ "tf_legacy_loss": false,
376
+ "tie_encoder_decoder": false,
377
+ "tie_word_embeddings": true,
378
+ "tokenizer_class": null,
379
+ "top_k": 50,
380
+ "top_p": 1.0,
381
+ "torch_dtype": "bfloat16",
382
+ "torchscript": false,
383
+ "transformers_version": "4.51.3",
384
+ "typical_p": 1.0,
385
+ "use_bfloat16": true,
386
+ "version": "radio_v2.5-h",
387
+ "vitdet_window_size": null
388
+ },
389
+ "eos_token_id": 2,
390
+ "image_size": [
391
+ 2048,
392
+ 1664
393
+ ],
394
+ "is_encoder_decoder": true,
395
+ "max_sequence_length": 9000,
396
+ "model_type": "nemotron_parse",
397
+ "pad_token_id": 1,
398
+ "tie_word_embeddings": false,
399
+ "torch_dtype": "bfloat16",
400
+ "transformers_version": "4.51.3",
401
+ "vocab_size": 52327
402
+ }
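
The auto_map entries above route AutoConfig/AutoModel/AutoProcessor to the custom classes shipped in this folder, so the checkpoint has to be loaded with trust_remote_code=True. A minimal loading sketch; the repository id below is a placeholder for wherever this upload lives (local folder or Hub repo id):

import torch
from transformers import AutoModel, AutoProcessor

repo_id = "path/to/this-upload"  # placeholder: local folder or Hub repo id of this upload

model = AutoModel.from_pretrained(repo_id, trust_remote_code=True, torch_dtype=torch.bfloat16)
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)
model.eval()
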
generation_config.json ADDED
@@ -0,0 +1,13 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "decoder_start_token_id": 2,
5
+ "eos_token_id": 2,
6
+ "forced_eos_token_id": 2,
7
+ "pad_token_id": 1,
8
+ "max_new_tokens": 9000,
9
+ "do_sample": false,
10
+ "num_beams": 1,
11
+ "repetition_penalty": 1.1,
12
+ "transformers_version": "4.51.3"
13
+ }
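
These defaults (greedy decoding with num_beams 1, repetition_penalty 1.1, up to 9000 new tokens) are what model.generate() picks up from the checkpoint; any of them can still be overridden per call. A short sketch, assuming the model was loaded as in the example above and pixel_values came from the processor (see the processor example further below):

output_ids = model.generate(
    pixel_values=pixel_values,
    max_new_tokens=4096,      # tighter cap than the default 9000, e.g. for short documents
    repetition_penalty=1.1,
)
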
hf_nemotron_parse_config.py ADDED
@@ -0,0 +1,136 @@
3
+ from transformers import PretrainedConfig
4
+ from typing import List, Optional
5
+
6
+ from transformers.dynamic_module_utils import get_class_from_dynamic_module
7
+
8
+ class NemotronParseLightTextConfig(PretrainedConfig):
9
+ """
10
+ Configuration class for NemotronParseLight text decoder (mBART-based).
11
+ """
12
+ model_type = "nemotron_parse_text"
13
+
14
+ def __init__(
15
+ self,
16
+ vocab_size: int = 250027,
17
+ d_model: int = 1024,
18
+ encoder_layers: int = 12,
19
+ decoder_layers: int = 12,
20
+ encoder_attention_heads: int = 16,
21
+ decoder_attention_heads: int = 16,
22
+ decoder_ffn_dim: int = 4096,
23
+ encoder_ffn_dim: int = 4096,
24
+ activation_function: str = "gelu",
25
+ dropout: float = 0.1,
26
+ attention_dropout: float = 0.0,
27
+ activation_dropout: float = 0.0,
28
+ classifier_dropout: float = 0.0,
29
+ init_std: float = 0.02,
30
+ encoder_layerdrop: float = 0.0,
31
+ decoder_layerdrop: float = 0.0,
32
+ scale_embedding: bool = False,
33
+ use_cache: bool = True,
34
+ num_labels: int = 3,
35
+ forced_eos_token_id: int = 2,
36
+ add_cross_attention: bool = True, # Enable cross-attention for vision-encoder-decoder
37
+ is_decoder: bool = True, # This is a decoder
38
+ max_sequence_length: int = 9000,
39
+ **kwargs
40
+ ):
41
+ super().__init__(**kwargs)
42
+ self.vocab_size = vocab_size
43
+ self.d_model = d_model
44
+ self.encoder_layers = encoder_layers
45
+ self.decoder_layers = decoder_layers
46
+ self.encoder_attention_heads = encoder_attention_heads
47
+ self.decoder_attention_heads = decoder_attention_heads
48
+ self.decoder_ffn_dim = decoder_ffn_dim
49
+ self.encoder_ffn_dim = encoder_ffn_dim
50
+ self.activation_function = activation_function
51
+ self.dropout = dropout
52
+ self.attention_dropout = attention_dropout
53
+ self.activation_dropout = activation_dropout
54
+ self.classifier_dropout = classifier_dropout
55
+ self.init_std = init_std
56
+ self.encoder_layerdrop = encoder_layerdrop
57
+ self.decoder_layerdrop = decoder_layerdrop
58
+ self.scale_embedding = scale_embedding
59
+ self.use_cache = use_cache
60
+ self.num_labels = num_labels
61
+ self.add_cross_attention = add_cross_attention
62
+ self.is_decoder = is_decoder
63
+
64
+ # Add hidden_size as alias for d_model (for compatibility)
65
+ self.hidden_size = self.d_model
66
+ self.forced_eos_token_id = forced_eos_token_id
67
+ self.num_attention_heads = self.encoder_attention_heads
68
+
69
+ self.max_sequence_length = max_sequence_length
70
+
71
+
72
+ class NemotronParseLightConfig(PretrainedConfig):
73
+ """
74
+ Configuration class for NemotronParseLight model.
75
+
76
+ This configuration class is used to store the configuration of a [`NemotronParseLightForConditionalGeneration`] model.
77
+ It is used to instantiate an NemotronParseLight model according to the specified arguments, defining the vision and text model configs.
78
+ """
79
+ model_type = "nemotron_parse"
80
+ is_composition = True
81
+ max_sequence_length = 9000
82
+
83
+ def __init__(
84
+ self,
85
+ encoder: Optional[dict] = None,
86
+ decoder: Optional[dict] = None,
87
+ tie_word_embeddings: bool = False,
88
+ decoder_start_token_id: int = 2,
89
+ pad_token_id: int = 1,
90
+ eos_token_id: int = 2,
91
+ bos_token_id: int = 0,
92
+ image_size: List[int] = [2048, 1664],
93
+ is_encoder_decoder: bool = True,
94
+ max_sequence_length: int = 9000,
95
+ **kwargs
96
+ ):
97
+ super().__init__(
98
+ tie_word_embeddings=tie_word_embeddings,
99
+ decoder_start_token_id=decoder_start_token_id,
100
+ pad_token_id=pad_token_id,
101
+ eos_token_id=eos_token_id,
102
+ bos_token_id=bos_token_id,
103
+ max_sequence_length=max_sequence_length,
104
+ **kwargs
105
+ )
106
+
107
+
108
+ if decoder is None:
109
+ decoder = {}
110
+
111
+ if encoder is not None:
112
+ assert "auto_map" in encoder and "AutoConfig" in encoder["auto_map"]
113
+ vision_auto_config = get_class_from_dynamic_module(*encoder["auto_map"]["AutoConfig"].split("--")[::-1])
114
+ self.encoder = vision_auto_config(**encoder)
115
+ else:
116
+ self.encoder = PretrainedConfig()
117
+
118
+ decoder["max_sequence_length"] = max_sequence_length
119
+ self.decoder = NemotronParseLightTextConfig(**decoder)
120
+ self.image_size = image_size
121
+
122
+ # Initialize vocab size from text config
123
+ self.vocab_size = self.decoder.vocab_size
124
+ self.is_encoder_decoder = is_encoder_decoder
125
+ self.max_sequence_length = max_sequence_length
126
+
127
+ def to_dict(self):
128
+ """
129
+ Serializes this instance to a Python dictionary. Override the default [`~PretrainedConfig.to_dict`].
130
+ """
131
+ output = super().to_dict()
132
+ output["encoder"] = self.encoder.to_dict()
133
+ output["decoder"] = self.decoder.to_dict()
134
+ output["model_type"] = self.model_type
135
+ output["is_encoder_decoder"] = self.is_encoder_decoder
136
+ return output
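
For reference, a sketch of how the composite config is rebuilt from the stored config.json: the decoder dict becomes a NemotronParseLightTextConfig, while the encoder dict is resolved through its own auto_map, which pulls the RADIOConfig code from the nvidia/C-RADIOv2-H repo (so network access or a local cache is assumed):

import json
from hf_nemotron_parse_config import NemotronParseLightConfig

with open("config.json") as f:                          # the config.json added in this commit
    cfg = NemotronParseLightConfig(**json.load(f))

print(type(cfg.encoder).__name__)                        # RADIOConfig, via encoder["auto_map"]["AutoConfig"]
print(cfg.decoder.d_model, cfg.decoder.decoder_layers)   # 1024, 10
print(cfg.image_size, cfg.max_sequence_length)           # [2048, 1664], 9000
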
hf_nemotron_parse_modeling.py ADDED
@@ -0,0 +1,619 @@
1
+ import math
2
+ import torch
3
+ import torch.nn as nn
4
+ from torch.nn import CrossEntropyLoss
5
+ from transformers import PreTrainedModel, GenerationMixin
6
+ from transformers.models.vision_encoder_decoder.modeling_vision_encoder_decoder import VisionEncoderDecoderModel
7
+ from transformers.models.vision_encoder_decoder.configuration_vision_encoder_decoder import VisionEncoderDecoderConfig
8
+ from transformers.modeling_outputs import Seq2SeqLMOutput
9
+ from transformers.models.mbart.modeling_mbart import MBartPreTrainedModel, MBartConfig, MBartScaledWordEmbedding, MBartDecoderLayer, BaseModelOutputWithPastAndCrossAttentions
10
+ from transformers.models.donut.modeling_donut_swin import DonutSwinModelOutput
11
+ from einops import rearrange
12
+ from typing import Optional, List, Union, Tuple
13
+ import warnings
14
+ from transformers.modeling_outputs import BaseModelOutput
15
+ from transformers.models.encoder_decoder.modeling_encoder_decoder import shift_tokens_right
16
+ from hf_nemotron_parse_config import NemotronParseLightConfig
17
+ from transformers import AutoModel
18
+ import time
19
+ from transformers.modeling_attn_mask_utils import (
20
+ _prepare_4d_attention_mask,
21
+ _prepare_4d_attention_mask_for_sdpa,
22
+ _prepare_4d_causal_attention_mask,
23
+ _prepare_4d_causal_attention_mask_for_sdpa,
24
+ )
+ from transformers.utils import logging
+
+ logger = logging.get_logger(__name__)
25
+
26
+ # Based on https://github.com/OpenGVLab/InternVL/blob/c7c5af1a8930b4862afe8ed14672307082ef61fa/internvl_chat/internvl/model/internvl_chat/modeling_internvl_chat.py#L218
27
+ # Copyright (c) 2023 OpenGVLab.
28
+ def pixel_shuffle(x, scale_factor=0.5, version=2):
29
+ """Pixel shuffle based on InternVL but adapted for our use case.
30
+
31
+ Args:
32
+ x (torch.Tensor): Vision model outputs [num_tiles, img_seq_len, h_vision]
33
+ version (int): Implementation version.
34
+
35
+ Returns:
36
+ Shuffled vision model outputs [num_tiles, (sq ** 2) * (scale ** 2), h_vision / (scale ** 2)]
37
+ """
38
+ h = 128
39
+ w = 26
40
+ x = x.reshape(x.shape[0], h, w, -1) # [num_tiles, sq, sq, h_vision]
41
+ x = x.permute(0,2,1,3).contiguous()
42
+ n, w, h, c = x.size()
43
+ # N, W, H, C --> N, W, H * scale, C // scale
44
+ x = x.reshape(n, w, int(h * scale_factor), int(c / scale_factor))
45
+ # N, W, H * scale, C // scale --> N, H * scale, W, C // scale
46
+ x = x.permute(0, 2, 1, 3).contiguous()
47
+ # N, H * scale, W, C // scale --> N, H * scale, W * scale, C // (scale ** 2)
48
+ x = x.reshape(
49
+ n, int(h * scale_factor), int(w*scale_factor), int(c / (scale_factor * scale_factor))) #int(w * scale_factor), int(c / (scale_factor * scale_factor))
50
+ #)
51
+
52
+ if version == 2:
53
+ x = x.permute(0, 2, 1, 3).contiguous()
54
+
55
+ x = x.reshape(x.shape[0], -1, x.shape[-1])
56
+
57
+ return x
58
+
59
+ class NemotronParseLightDecoder(MBartPreTrainedModel):
60
+ """
61
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`MBartDecoderLayer`]
62
+
63
+ Args:
64
+ config: MBartConfig
65
+ embed_tokens (nn.Embedding): output embedding
66
+ """
67
+
68
+ def __init__(self, config: MBartConfig, embed_tokens: Optional[nn.Embedding] = None):
69
+ super().__init__(config)
70
+ self.dropout = config.dropout
71
+ self.layerdrop = config.decoder_layerdrop
72
+ self.padding_idx = config.pad_token_id
73
+ embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
74
+
75
+ self.embed_tokens = MBartScaledWordEmbedding(
76
+ config.vocab_size, config.d_model, self.padding_idx, embed_scale=embed_scale
77
+ )
78
+
79
+ if embed_tokens is not None:
80
+ self.embed_tokens.weight = embed_tokens.weight
81
+
82
+ self.layers = nn.ModuleList([MBartDecoderLayer(config) for _ in range(config.decoder_layers)])
83
+ self.config = config
84
+
85
+ self.layernorm_embedding = nn.LayerNorm(config.d_model)
86
+ self.layer_norm = nn.LayerNorm(config.d_model)
87
+
88
+ self.gradient_checkpointing = False
89
+ # Initialize weights and apply final processing
90
+ self.post_init()
91
+
92
+ def get_input_embeddings(self):
93
+ return self.embed_tokens
94
+
95
+ def set_input_embeddings(self, value):
96
+ self.embed_tokens = value
97
+
98
+ def forward(
99
+ self,
100
+ input_ids: Optional[torch.LongTensor] = None,
101
+ attention_mask: Optional[torch.Tensor] = None,
102
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
103
+ encoder_attention_mask: Optional[torch.LongTensor] = None,
104
+ head_mask: Optional[torch.Tensor] = None,
105
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
106
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
107
+ inputs_embeds: Optional[torch.FloatTensor] = None,
108
+ use_cache: Optional[bool] = None,
109
+ output_attentions: Optional[bool] = None,
110
+ output_hidden_states: Optional[bool] = None,
111
+ return_dict: Optional[bool] = None,
112
+ ) -> Union[Tuple, BaseModelOutputWithPastAndCrossAttentions]:
113
+ r"""
114
+ Args:
115
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
116
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
117
+ provide it.
118
+
119
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
120
+ [`PreTrainedTokenizer.__call__`] for details.
121
+
122
+ [What are input IDs?](../glossary#input-ids)
123
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
124
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
125
+
126
+ - 1 for tokens that are **not masked**,
127
+ - 0 for tokens that are **masked**.
128
+
129
+ [What are attention masks?](../glossary#attention-mask)
130
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
131
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
132
+ of the decoder.
133
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
134
+ Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
135
+ selected in `[0, 1]`:
136
+
137
+ - 1 for tokens that are **not masked**,
138
+ - 0 for tokens that are **masked**.
139
+
140
+ [What are attention masks?](../glossary#attention-mask)
141
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
142
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
143
+
144
+ - 1 indicates the head is **not masked**,
145
+ - 0 indicates the head is **masked**.
146
+
147
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
148
+ Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
149
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
150
+
151
+ - 1 indicates the head is **not masked**,
152
+ - 0 indicates the head is **masked**.
153
+
154
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
155
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
156
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
157
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
158
+
159
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
160
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
161
+
162
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
163
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
164
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
165
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
166
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
167
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
168
+ than the model's internal embedding lookup matrix.
169
+ output_attentions (`bool`, *optional*):
170
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
171
+ returned tensors for more detail.
172
+ output_hidden_states (`bool`, *optional*):
173
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
174
+ for more detail.
175
+ return_dict (`bool`, *optional*):
176
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
177
+ """
178
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
179
+ output_hidden_states = (
180
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
181
+ )
182
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
183
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
184
+
185
+ # retrieve input_ids and inputs_embeds
186
+ if input_ids is not None and inputs_embeds is not None:
187
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
188
+ elif input_ids is not None:
189
+ input = input_ids
190
+ input_shape = input.size()
191
+ input_ids = input_ids.view(-1, input_shape[-1])
192
+ elif inputs_embeds is not None:
193
+ input_shape = inputs_embeds.size()[:-1]
194
+ input = inputs_embeds[:, :, -1]
195
+ else:
196
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
197
+
198
+ # past_key_values_length
199
+ past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
200
+
201
+ if inputs_embeds is None:
202
+ inputs_embeds = self.embed_tokens(input_ids)
203
+
204
+ if self.config._attn_implementation == "flash_attention_2":
205
+ # 2d mask is passed through the layers
206
+ attention_mask = attention_mask if (attention_mask is not None and 0 in attention_mask) else None
207
+ elif self.config._attn_implementation == "sdpa" and not output_attentions and cross_attn_head_mask is None:
208
+ # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
209
+ # the manual implementation that requires a 4D causal mask in all cases.
210
+ attention_mask = _prepare_4d_causal_attention_mask_for_sdpa(
211
+ attention_mask,
212
+ input_shape,
213
+ inputs_embeds,
214
+ past_key_values_length,
215
+ )
216
+ else:
217
+ # 4d mask is passed through the layers
218
+ attention_mask = _prepare_4d_causal_attention_mask(
219
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
220
+ )
221
+
222
+ # expand encoder attention mask
223
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
224
+ if self.config._attn_implementation == "flash_attention_2":
225
+ encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
226
+ elif self.config._attn_implementation == "sdpa" and cross_attn_head_mask is None and not output_attentions:
227
+ # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
228
+ # the manual implementation that requires a 4D causal mask in all cases.
229
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
230
+ encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
231
+ encoder_attention_mask,
232
+ inputs_embeds.dtype,
233
+ tgt_len=input_shape[-1],
234
+ )
235
+ else:
236
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
237
+ encoder_attention_mask = _prepare_4d_attention_mask(
238
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
239
+ )
240
+ hidden_states = inputs_embeds
241
+ hidden_states = self.layernorm_embedding(hidden_states)
242
+
243
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
244
+
245
+ if self.gradient_checkpointing and self.training:
246
+ if use_cache:
247
+ logger.warning_once(
248
+ "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
249
+ )
250
+ use_cache = False
251
+
252
+ # decoder layers
253
+ all_hidden_states = () if output_hidden_states else None
254
+ all_self_attns = () if output_attentions else None
255
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
256
+ next_decoder_cache = () if use_cache else None
257
+
258
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
259
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
260
+ if attn_mask is not None:
261
+ if attn_mask.size()[0] != len(self.layers):
262
+ raise ValueError(
263
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
264
+ f" {attn_mask.size()[0]}."
265
+ )
266
+ for idx, decoder_layer in enumerate(self.layers):
267
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
268
+ if output_hidden_states:
269
+ all_hidden_states += (hidden_states,)
270
+ if self.training:
271
+ dropout_probability = torch.rand([])
272
+ if dropout_probability < self.layerdrop:
273
+ continue
274
+
275
+ past_key_value = past_key_values[idx] if past_key_values is not None else None
276
+
277
+ if self.gradient_checkpointing and self.training:
278
+ layer_outputs = self._gradient_checkpointing_func(
279
+ decoder_layer.__call__,
280
+ hidden_states,
281
+ attention_mask,
282
+ encoder_hidden_states,
283
+ encoder_attention_mask,
284
+ head_mask[idx] if head_mask is not None else None,
285
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
286
+ None,
287
+ output_attentions,
288
+ use_cache,
289
+ )
290
+ else:
291
+ layer_outputs = decoder_layer(
292
+ hidden_states,
293
+ attention_mask=attention_mask,
294
+ encoder_hidden_states=encoder_hidden_states,
295
+ encoder_attention_mask=encoder_attention_mask,
296
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
297
+ cross_attn_layer_head_mask=(
298
+ cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None
299
+ ),
300
+ past_key_value=past_key_value,
301
+ output_attentions=output_attentions,
302
+ use_cache=use_cache,
303
+ )
304
+ hidden_states = layer_outputs[0]
305
+
306
+ if use_cache:
307
+ next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
308
+
309
+ if output_attentions:
310
+ all_self_attns += (layer_outputs[1],)
311
+
312
+ if encoder_hidden_states is not None:
313
+ all_cross_attentions += (layer_outputs[2],)
314
+
315
+ hidden_states = self.layer_norm(hidden_states)
316
+
317
+ # add hidden states from the last decoder layer
318
+ if output_hidden_states:
319
+ all_hidden_states += (hidden_states,)
320
+
321
+ next_cache = next_decoder_cache if use_cache else None
322
+ if not return_dict:
323
+ return tuple(
324
+ v
325
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
326
+ if v is not None
327
+ )
328
+ return BaseModelOutputWithPastAndCrossAttentions(
329
+ last_hidden_state=hidden_states,
330
+ past_key_values=next_cache,
331
+ hidden_states=all_hidden_states,
332
+ attentions=all_self_attns,
333
+ cross_attentions=all_cross_attentions,
334
+ )
335
+
336
+
337
+ class RadioWithNeck(nn.Module):
338
+ """Vision encoder using RADIO model with custom neck."""
339
+
340
+ def __init__(self, config):
341
+ super().__init__()
342
+ self.config = config
343
+
344
+ self.model_encoder = AutoModel.from_config(config, trust_remote_code=True)
345
+
346
+ # Neck components
347
+ last_hidden_state = 1024
348
+ self.conv1 = nn.Conv1d(1280, last_hidden_state, 1)
349
+ self.layer_norm1 = nn.LayerNorm(last_hidden_state, eps=1e-06, elementwise_affine=True)
350
+ self.conv2 = nn.Conv2d(last_hidden_state, last_hidden_state, kernel_size=(1,4), stride=(1,4), padding=0, bias=False)
351
+ self.layer_norm2 = nn.LayerNorm(last_hidden_state, eps=1e-06, elementwise_affine=True)
352
+ self.sum_proj = nn.Linear(3840, last_hidden_state)
353
+ self.proj_pixshuf = torch.nn.Linear(4096, 1024)
354
+ self.layer_norm3 = nn.LayerNorm(last_hidden_state, eps=1e-06, elementwise_affine=True)
355
+
356
+ def forward(self, pixel_values, output_attentions=False, output_hidden_states=False, return_dict=False, **kwargs):
357
+ radio_output = self.model_encoder(pixel_values)
358
+ summary, feature = radio_output
359
+
360
+
361
+ output = self.conv1(feature.permute(0,2,1)).permute(0,2,1)
362
+ output = self.layer_norm1(output)
363
+
364
+ patch_size = self.config.patch_size
365
+ output = rearrange(output, 'b (h w) d -> b d h w',
366
+ h=pixel_values.shape[-2] // patch_size,
367
+ w=pixel_values.shape[-1] // patch_size)
368
+
369
+ output = self.conv2(output)
370
+ output = rearrange(output, 'b d h w -> b (h w) d')
371
+ output = pixel_shuffle(output)
372
+ output = self.layer_norm2(self.proj_pixshuf(output))
373
+ summary = self.layer_norm3(self.sum_proj(summary))
374
+ output = torch.cat((output, summary.unsqueeze(1)), dim=1)
375
+
376
+ return DonutSwinModelOutput(last_hidden_state=output)
377
+
378
+
379
+ class NemotronParseLightPreTrainedModel(PreTrainedModel):
380
+ """
381
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models.
382
+ """
383
+ config_class = NemotronParseLightConfig
384
+ base_model_prefix = "vision_encoder_decoder" # Use VisionEncoderDecoder prefix
385
+ main_input_name = "pixel_values"
386
+ supports_gradient_checkpointing = True
387
+ _no_split_modules = ["RadioWithNeck", "MBartDecoder"]
388
+ _skip_keys_device_placement = "past_key_values"
389
+
390
+ def _init_weights(self, module):
391
+ """Initialize the weights"""
392
+ if isinstance(module, nn.Linear):
393
+ module.weight.data.normal_(mean=0.0, std=self.config.decoder.init_std)
394
+ if module.bias is not None:
395
+ module.bias.data.zero_()
396
+ elif isinstance(module, nn.Embedding):
397
+ module.weight.data.normal_(mean=0.0, std=self.config.decoder.init_std)
398
+ if module.padding_idx is not None:
399
+ module.weight.data[module.padding_idx].zero_()
400
+
401
+ # Based on transformers.models.encoder_decoder.modeling_encoder_decoder
402
+ class NemotronParseLightForConditionalGeneration(NemotronParseLightPreTrainedModel, GenerationMixin):
403
+ """
404
+ NemotronParseLight model for conditional generation tasks.
405
+
406
+ This model combines a RADIO-based vision encoder with an mBART-based text decoder.
407
+ """
408
+
409
+ def __init__(self, config: NemotronParseLightConfig):
410
+ super().__init__(config)
411
+
412
+ self.encoder = RadioWithNeck(config.encoder)
413
+ self.encoder.main_input_name = 'pixel_values'
414
+ self.encoder = self.encoder.to(config.encoder.torch_dtype)
415
+
416
+ self.decoder = NemotronParseLightDecoder(config.decoder)
417
+ self.decoder = self.decoder.to(config.decoder.torch_dtype)
418
+
419
+ self.lm_head = nn.Linear(config.decoder.d_model, config.decoder.vocab_size, bias=False, dtype=config.decoder.torch_dtype)
420
+
421
+ # Extra heads
422
+ num_extra_heads = getattr(config, 'num_extra_heads', 0)
423
+ self.decoder.extra_heads = nn.ModuleList([
424
+ nn.Linear(config.decoder.d_model, config.decoder.d_model)
425
+ for _ in range(num_extra_heads)
426
+ ])
427
+ self.decoder.extra_proj = nn.ModuleList([
428
+ nn.Linear(config.decoder.d_model, config.decoder.d_model)
429
+ for _ in range(num_extra_heads)
430
+ ])
431
+
432
+ # Class token index for loss weighting
433
+ self.class_token_indx_start = getattr(config, 'class_token_start_idx', 50000)
434
+
435
+ self.post_init()
436
+
437
+ def get_encoder(self):
438
+ return self.encoder
439
+
440
+ def get_decoder(self):
441
+ return self.decoder
442
+
443
+ def get_output_embeddings(self):
444
+ return self.lm_head
445
+
446
+ def set_output_embeddings(self, new_embeddings):
447
+ self.lm_head = new_embeddings
448
+
449
+ def get_input_embeddings(self):
450
+ return self.decoder.get_input_embeddings()
451
+
452
+ def forward(
453
+ self,
454
+ pixel_values: Optional[torch.FloatTensor] = None,
455
+ decoder_input_ids: Optional[torch.LongTensor] = None,
456
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
457
+ encoder_outputs: Optional[Tuple[torch.FloatTensor]] = None,
458
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
459
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
460
+ labels: Optional[torch.LongTensor] = None,
461
+ use_cache: Optional[bool] = None,
462
+ output_attentions: Optional[bool] = None,
463
+ output_hidden_states: Optional[bool] = None,
464
+ return_dict: Optional[bool] = None,
465
+ __subflavors__: Optional[str] = None,
466
+ __keys__: Optional[List[str]] = None,
467
+ return_sample_losses: Optional[torch.FloatTensor] = None,
468
+ **kwargs,
469
+ ) -> Union[Tuple[torch.FloatTensor], Seq2SeqLMOutput]:
470
+
471
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
472
+
473
+ kwargs_encoder = {argument: value for argument, value in kwargs.items() if not argument.startswith("decoder_")}
474
+
475
+ kwargs_decoder = {
476
+ argument[len("decoder_") :]: value for argument, value in kwargs.items() if argument.startswith("decoder_")
477
+ }
478
+
479
+ if encoder_outputs is None:
480
+ if pixel_values is None:
481
+ raise ValueError("You have to specify pixel_values")
482
+
483
+ encoder_outputs = self.encoder(
484
+ pixel_values,
485
+ output_attentions=output_attentions,
486
+ output_hidden_states=output_hidden_states,
487
+ return_dict=return_dict,
488
+ **kwargs_encoder,
489
+ )
490
+
491
+ elif isinstance(encoder_outputs, tuple):
492
+ encoder_outputs = BaseModelOutput(*encoder_outputs)
493
+
494
+ encoder_hidden_states = encoder_outputs[0]
495
+
496
+ encoder_attention_mask = None
497
+
498
+ if (labels is not None) and (decoder_input_ids is None and decoder_inputs_embeds is None):
499
+ decoder_input_ids = shift_tokens_right(
500
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
501
+ )
502
+
503
+ output_hidden_states = True
504
+
505
+ decoder_outputs = self.decoder(
506
+ input_ids=decoder_input_ids,
507
+ attention_mask=decoder_attention_mask,
508
+ encoder_hidden_states=encoder_hidden_states,
509
+ encoder_attention_mask=encoder_attention_mask,
510
+ inputs_embeds=decoder_inputs_embeds,
511
+ output_attentions=output_attentions,
512
+ output_hidden_states=output_hidden_states,
513
+ use_cache=use_cache,
514
+ past_key_values=past_key_values,
515
+ return_dict=return_dict,
516
+ **kwargs_decoder,
517
+ )
518
+ loss = None
519
+
520
+ if labels is not None:
521
+ main_logits = self.lm_head(decoder_outputs.last_hidden_state)
522
+ logits = [main_logits]
523
+ decoder_inputs_embeds = decoder_outputs.inputs_embeds
524
+ for iii, head in enumerate(self.decoder.extra_heads):
525
+
526
+ decoder_input_embeds_shift = self.decoder.extra_proj[iii](torch.cat((decoder_inputs_embeds[:,1:,:], torch.zeros_like(decoder_inputs_embeds[:,0,:].unsqueeze(1))), axis=1))
527
+ hidden = head(decoder_outputs['hidden_states'][-1] + decoder_input_embeds_shift)
528
+ logits.append(self.lm_head(hidden)) # Use main lm_head, NOT decoder.lm_head
529
+
530
+ logits = torch.stack(logits, dim=-2)
531
+ loss_fct = CrossEntropyLoss(reduction="none")
532
+
533
+ losses_per_head = []
534
+ tokens_per_head = []
535
+ for head_num in range(len(self.decoder.extra_heads)+1):
536
+ logits_head = logits[:,:,head_num,:]
537
+ labels_head = torch.cat(
538
+ (labels[:, head_num:], torch.full_like(labels[:, :head_num], -100)),
539
+ 1
540
+ )
541
+ loss_full = loss_fct(logits_head.permute(0, 2, 1), labels_head)
542
+ loss_full[labels_head >= self.class_token_indx_start] *= 10
543
+ losses_per_head.append(loss_full.sum(1))
544
+ tokens_per_head.append((labels_head != -100).sum(1))
545
+
546
+ losses_per_sample = torch.stack(losses_per_head, dim=1).sum(1)
547
+ tokens_per_sample = torch.stack(tokens_per_head, dim=1).sum(1)
548
+ loss = losses_per_sample.sum() / (tokens_per_sample.sum() + 1e-6)
549
+ if return_sample_losses is not None:
550
+ return_sample_losses.copy_(losses_per_sample.detach() / (tokens_per_sample + 1e-6))
551
+
552
+ if not return_dict:
553
+ if loss is not None:
554
+ return (loss,) + decoder_outputs + encoder_outputs
555
+ else:
556
+ return decoder_outputs + encoder_outputs
557
+ output_logits = self.lm_head(decoder_outputs.last_hidden_state)
558
+ return Seq2SeqLMOutput(
559
+ loss=loss,
560
+ logits=output_logits,
561
+ past_key_values=decoder_outputs.past_key_values,
562
+ decoder_hidden_states=decoder_outputs.hidden_states,
563
+ decoder_attentions=decoder_outputs.attentions,
564
+ cross_attentions=decoder_outputs.cross_attentions,
565
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
566
+ encoder_hidden_states=encoder_outputs.hidden_states,
567
+ encoder_attentions=encoder_outputs.attentions,
568
+ )
569
+
570
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
571
+ return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
572
+
573
+
574
+ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, pad_to_multiple_of: Optional[int] = None):
575
+ """Resize token embeddings and update lm_head accordingly."""
576
+ # Resize decoder embeddings
577
+ new_embeddings = self.decoder.resize_token_embeddings(new_num_tokens, pad_to_multiple_of)
578
+
579
+ # Update lm_head to match new vocab size
580
+ if new_embeddings is not None:
581
+ old_vocab_size, hidden_size = self.lm_head.weight.shape
582
+ new_vocab_size = new_embeddings.num_embeddings
583
+
584
+ if old_vocab_size != new_vocab_size:
585
+ print(f"Resizing lm_head from {old_vocab_size} to {new_vocab_size} tokens")
586
+ new_lm_head = nn.Linear(hidden_size, new_vocab_size, bias=False, device=self.lm_head.weight.device, dtype=self.lm_head.weight.dtype)
587
+
588
+ # Copy old weights to new lm_head
589
+ num_tokens_to_copy = min(old_vocab_size, new_vocab_size)
590
+ new_lm_head.weight.data[:num_tokens_to_copy] = self.lm_head.weight.data[:num_tokens_to_copy]
591
+
592
+ # Update reference
593
+ self.lm_head = new_lm_head
594
+ # DO NOT update decoder.lm_head - keep them separate
595
+
596
+ return new_embeddings
597
+
598
+ def _reorder_cache(self, past_key_values, beam_idx):
599
+ # apply decoder cache reordering here
600
+ return self.decoder._reorder_cache(past_key_values, beam_idx)
601
+
602
+
603
+ # Copied from transformers.models.encoder_decoder.modeling_encoder_decoder.shift_tokens_right
604
+ def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
605
+ """
606
+ Shift input ids one token to the right.
607
+ """
608
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
609
+ shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
610
+ if decoder_start_token_id is None:
611
+ raise ValueError("Make sure to set the decoder_start_token_id attribute of the model's configuration.")
612
+ shifted_input_ids[:, 0] = decoder_start_token_id
613
+
614
+ if pad_token_id is None:
615
+ raise ValueError("Make sure to set the pad_token_id attribute of the model's configuration.")
616
+ # replace possible -100 values in labels by `pad_token_id`
617
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
618
+
619
+ return shifted_input_ids
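
A quick shape check of the pixel_shuffle step above: with the default 2048x1664 input, patch size 16 and the stride-(1,4) conv2, the encoder neck hands pixel_shuffle a 128x26 = 3328-token grid with 1024 channels, and the shuffle folds it into 832 tokens of width 4096, which is exactly what proj_pixshuf (4096 -> 1024) consumes. A standalone sanity check, assuming this file is importable:

import torch
from hf_nemotron_parse_modeling import pixel_shuffle

x = torch.randn(1, 128 * 26, 1024)   # [num_tiles, grid tokens, channels after conv2]
y = pixel_shuffle(x, scale_factor=0.5)
print(y.shape)                        # torch.Size([1, 832, 4096])
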
hf_nemotron_parse_processor.py ADDED
@@ -0,0 +1,376 @@
1
+ import numpy as np
2
+ from PIL import Image
3
+ from typing import List, Optional, Union, Dict, Any
4
+ import torch
5
+ from torchvision import transforms as T
6
+ import albumentations as A
7
+ import cv2
8
+ import json
9
+
10
+ from transformers import ProcessorMixin, BaseImageProcessor, ImageProcessingMixin
11
+ from transformers.tokenization_utils_base import BatchEncoding
12
+ from transformers.image_utils import ChannelDimension, ImageInput, PILImageResampling, infer_channel_dimension_format
13
+ from transformers.utils import TensorType
14
+
15
+
16
+ class NemotronParseLightImageProcessor(BaseImageProcessor, ImageProcessingMixin):
17
+ """
18
+ Image processor for NemotronParseLight model.
19
+
20
+ This processor inherits from BaseImageProcessor to be compatible with transformers AutoImageProcessor.
21
+ """
22
+
23
+ model_input_names = ["pixel_values"]
24
+
25
+ def __init__(
26
+ self,
27
+ final_size: tuple = (2048, 1664),
28
+ **kwargs,
29
+ ):
30
+ clean_kwargs = {}
31
+ for k, v in kwargs.items():
32
+ if not k.startswith('_') and k not in ['transform', 'torch_transform']:
33
+ clean_kwargs[k] = v
34
+
35
+ if 'size' in clean_kwargs:
36
+ size_config = clean_kwargs.pop('size')
37
+ if isinstance(size_config, dict):
38
+ if 'longest_edge' in size_config:
39
+ longest_edge = size_config['longest_edge']
40
+ if isinstance(longest_edge, (list, tuple)):
41
+ final_size = tuple(int(x) for x in longest_edge)
42
+ else:
43
+ final_size = (int(longest_edge), int(longest_edge))
44
+ elif 'height' in size_config and 'width' in size_config:
45
+ final_size = (int(size_config['height']), int(size_config['width']))
46
+
47
+ super().__init__(**clean_kwargs)
48
+
49
+ if isinstance(final_size, (list, tuple)) and len(final_size) >= 2:
50
+ self.final_size = (int(final_size[0]), int(final_size[1]))
51
+ elif isinstance(final_size, (int, float)):
52
+ self.final_size = (int(final_size), int(final_size))
53
+ else:
54
+ self.final_size = (2048, 1664) # Default fallback
55
+
56
+ self._create_transforms()
57
+
58
+ def _create_transforms(self):
59
+ """Create transform objects (not serialized to JSON)."""
60
+ if isinstance(self.final_size, (list, tuple)):
61
+ self.target_height, self.target_width = int(self.final_size[0]), int(self.final_size[1])
62
+ else:
63
+ self.target_height = self.target_width = int(self.final_size)
64
+
65
+ self.transform = A.Compose([
66
+ A.PadIfNeeded(
67
+ min_height=self.target_height,
68
+ min_width=self.target_width,
69
+ border_mode=cv2.BORDER_CONSTANT,
70
+ value=[255, 255, 255],
71
+ p=1.0
72
+ ),
73
+ ])
74
+
75
+ self.torch_transform = T.Compose([
76
+ T.ToTensor(),
77
+ # Note: Normalization is done within RADIO model
78
+ ])
79
+
80
+ def to_dict(self):
81
+ """Override to exclude non-serializable transforms."""
82
+ output = super().to_dict()
83
+ output.pop('transform', None)
84
+ output.pop('torch_transform', None)
85
+ return output
86
+
87
+ @classmethod
88
+ def from_dict(cls, config_dict: dict, **kwargs):
89
+ """Override to recreate transforms after loading."""
90
+ config_dict = config_dict.copy()
91
+ config_dict.pop('transform', None)
92
+ config_dict.pop('torch_transform', None)
93
+
94
+ # Clean any problematic entries
95
+ for key in list(config_dict.keys()):
96
+ if key.startswith('_') or config_dict[key] is None:
97
+ config_dict.pop(key, None)
98
+
99
+ # Ensure numeric types are correct
100
+ if 'final_size' in config_dict:
101
+ final_size = config_dict['final_size']
102
+ if isinstance(final_size, (list, tuple)):
103
+ config_dict['final_size'] = tuple(int(x) for x in final_size)
104
+
105
+ try:
106
+ return cls(**config_dict, **kwargs)
107
+ except Exception as e:
108
+ print(f"Warning: Error in from_dict: {e}")
109
+ print("Using default parameters...")
110
+ return cls(**kwargs)
111
+
112
+ def save_pretrained(self, save_directory, **kwargs):
113
+ """Save image processor configuration."""
114
+ import os
115
+ import json
116
+
117
+ os.makedirs(save_directory, exist_ok=True)
118
+
119
+ # Save preprocessor config in standard HuggingFace format
120
+ config = {
121
+ "feature_extractor_type": "NemotronParseLightImageProcessor",
122
+ "image_processor_type": "NemotronParseLightImageProcessor",
123
+ "processor_class": "NemotronParseLightImageProcessor",
124
+ "size": {
125
+ "height": self.final_size[0],
126
+ "width": self.final_size[1],
127
+ "longest_edge": self.final_size
128
+ },
129
+ "final_size": self.final_size,
130
+ }
131
+
132
+ config_path = os.path.join(save_directory, "preprocessor_config.json")
133
+ with open(config_path, 'w') as f:
134
+ json.dump(config, f, indent=2)
135
+
136
+ def _resize_with_aspect_ratio(self, image: np.ndarray) -> np.ndarray:
137
+ """Resize image maintaining aspect ratio (exact replica of original LongestMaxSizeHW)."""
138
+ height, width = image.shape[:2]
139
+ max_size_height = self.target_height
140
+ max_size_width = self.target_width
141
+
142
+ # Original LongestMaxSizeHW algorithm from custom_augmentations.py
143
+ aspect_ratio = width / height
144
+ new_height = height
145
+ new_width = width
146
+
147
+ if height > max_size_height:
148
+ new_height = max_size_height
149
+ new_width = int(new_height * aspect_ratio)
150
+
151
+ if new_width > max_size_width:
152
+ new_width = max_size_width
153
+ new_height = int(new_width / aspect_ratio)
154
+
155
+ return cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
156
+
157
+ def _pad_to_size(self, image: np.ndarray) -> np.ndarray:
158
+ """Pad image to target size with white padding (matches A.PadIfNeeded behavior)."""
159
+ h, w = image.shape[:2]
160
+ min_height, min_width = self.target_height, self.target_width
161
+
162
+ pad_h = max(0, min_height - h)
163
+ pad_w = max(0, min_width - w)
164
+
165
+ if pad_h == 0 and pad_w == 0:
166
+ return image
167
+
168
+ if len(image.shape) == 3:
169
+ padded = np.pad(
170
+ image,
171
+ ((0, pad_h), (0, pad_w), (0, 0)),
172
+ mode='constant',
173
+ constant_values=255
174
+ )
175
+ else:
176
+ padded = np.pad(
177
+ image,
178
+ ((0, pad_h), (0, pad_w)),
179
+ mode='constant',
180
+ constant_values=255
181
+ )
182
+
183
+ return padded
184
+
185
+ def preprocess(
186
+ self,
187
+ images: ImageInput,
188
+ return_tensors: Optional[Union[str, TensorType]] = None,
189
+ **kwargs,
190
+ ) -> Dict[str, torch.Tensor]:
191
+ """
192
+ Preprocess an image or batch of images for the NemotronParseLight model.
193
+
194
+ Args:
195
+ images: Input image(s)
196
+ return_tensors: Type of tensors to return
197
+ """
198
+
199
+ # Ensure images is a list
200
+ if not isinstance(images, list):
201
+ images = [images]
202
+
203
+ # Convert PIL images to numpy arrays if needed
204
+ processed_images = []
205
+ for image in images:
206
+ if isinstance(image, Image.Image):
207
+ image = np.asarray(image)
208
+ processed_images.append(image)
209
+
210
+ # Apply NemotronParseLight-specific transforms
211
+ pixel_values = []
212
+ for image in processed_images:
213
+ processed_image = self._resize_with_aspect_ratio(image)
214
+
215
+ if self.transform is not None:
216
+ transformed = self.transform(image=processed_image)
217
+ processed_image = transformed["image"]
218
+ else:
219
+ # Fallback: just pad to target size
220
+ processed_image = self._pad_to_size(processed_image)
221
+
222
+ pixel_values_tensor = self.torch_transform(processed_image)
223
+
224
+ if pixel_values_tensor.shape[0] == 1:
225
+ pixel_values_tensor = pixel_values_tensor.expand(3, -1, -1)
226
+
227
+ pixel_values.append(pixel_values_tensor)
228
+
229
+ pixel_values = torch.stack(pixel_values)
230
+
231
+ data = {"pixel_values": pixel_values}
232
+
233
+ if return_tensors is not None:
234
+ data = self._convert_output_format(data, return_tensors)
235
+
236
+ return data
237
+
238
+ def _convert_output_format(self, data: Dict[str, torch.Tensor], return_tensors: Union[str, TensorType]) -> Dict:
239
+ """Convert output format based on return_tensors parameter."""
240
+ if return_tensors == "pt" or return_tensors == TensorType.PYTORCH:
241
+ return data
242
+ elif return_tensors == "np" or return_tensors == TensorType.NUMPY:
243
+ return {k: v.numpy() for k, v in data.items()}
244
+ else:
245
+ return data
246
+
247
+ def __call__(self, images: Union[Image.Image, List[Image.Image]], **kwargs) -> Dict[str, torch.Tensor]:
248
+ """Process images for the model (backward compatibility)."""
249
+ return self.preprocess(images, **kwargs)
250
+
251
+
252
+ class NemotronParseLightProcessor(ProcessorMixin):
253
+
254
+ attributes = ["image_processor", "tokenizer"]
255
+ image_processor_class = "NemotronParseLightImageProcessor"
256
+ tokenizer_class = ("PreTrainedTokenizer", "PreTrainedTokenizerFast")
257
+
258
+ def __init__(self, image_processor=None, tokenizer=None, **kwargs):
259
+ if image_processor is None:
260
+ image_processor = NemotronParseLightImageProcessor(**kwargs)
261
+
262
+ super().__init__(image_processor, tokenizer)
263
+
264
+
265
+ def __call__(
266
+ self,
267
+ images: Union[Image.Image, List[Image.Image]] = None,
268
+ text: Union[str, List[str]] = None,
269
+ add_special_tokens: bool = True,
270
+ padding: Union[bool, str] = False,
271
+ truncation: Union[bool, str] = False,
272
+ max_length: Optional[int] = None,
273
+ stride: int = 0,
274
+ pad_to_multiple_of: Optional[int] = None,
275
+ return_attention_mask: Optional[bool] = None,
276
+ return_overflowing_tokens: bool = False,
277
+ return_special_tokens_mask: bool = False,
278
+ return_offsets_mapping: bool = False,
279
+ return_token_type_ids: bool = False,
280
+ return_length: bool = False,
281
+ verbose: bool = True,
282
+ return_tensors: Optional[Union[str, "TensorType"]] = None,
283
+ **kwargs
284
+ ) -> BatchEncoding:
285
+ """
286
+ Main method to prepare for the model one or several text(s) and image(s).
287
+ """
288
+
289
+ # Process images
290
+ if images is not None:
291
+ image_inputs = self.image_processor(images, **kwargs)
292
+ else:
293
+ image_inputs = {}
294
+
295
+ # Process text
296
+ if text is not None:
297
+ text_inputs = self.tokenizer(
298
+ text,
299
+ add_special_tokens=add_special_tokens,
300
+ padding=padding,
301
+ truncation=truncation,
302
+ max_length=max_length,
303
+ stride=stride,
304
+ pad_to_multiple_of=pad_to_multiple_of,
305
+ return_attention_mask=return_attention_mask,
306
+ return_overflowing_tokens=return_overflowing_tokens,
307
+ return_special_tokens_mask=return_special_tokens_mask,
308
+ return_offsets_mapping=return_offsets_mapping,
309
+ return_token_type_ids=return_token_type_ids,
310
+ return_length=return_length,
311
+ verbose=verbose,
312
+ return_tensors=return_tensors,
313
+ **kwargs,
314
+ )
315
+ else:
316
+ text_inputs = {}
317
+
318
+ # Combine inputs
319
+ return BatchEncoding({**image_inputs, **text_inputs})
320
+
321
+ def decode(self, *args, **kwargs):
322
+ """Decode token ids to strings."""
323
+ return self.tokenizer.decode(*args, **kwargs)
324
+
325
+ def batch_decode(self, *args, **kwargs):
326
+ """Batch decode token ids to strings."""
327
+ return self.tokenizer.batch_decode(*args, **kwargs)
328
+
329
+ def post_process_generation(self, sequences, fix_markdown=False):
330
+ """Post-process generated sequences."""
331
+ if hasattr(self.tokenizer, 'post_process_generation'):
332
+ return self.tokenizer.post_process_generation(sequences, fix_markdown=fix_markdown)
333
+ else:
334
+ # Fallback processing
335
+ if isinstance(sequences, str):
336
+ sequences = [sequences]
337
+
338
+ processed = []
339
+ for seq in sequences:
340
+ # Basic cleaning
341
+ seq = seq.replace('<s>', '').replace('</s>', '').strip()
342
+ processed.append(seq)
343
+
344
+ return processed[0] if len(processed) == 1 else processed
345
+
346
+ @classmethod
347
+ def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
348
+ """
349
+ Load processor from pretrained model.
350
+
351
+ This method is compatible with AutoProcessor.from_pretrained().
352
+ """
353
+ # Use the parent class's from_pretrained method which handles auto-loading
354
+ return super().from_pretrained(pretrained_model_name_or_path, **kwargs)
355
+
356
+ def save_pretrained(self, save_directory, **kwargs):
357
+ """
358
+ Save processor to directory.
359
+
360
+ This method is compatible with AutoProcessor/AutoImageProcessor loading.
361
+ """
362
+ import os
363
+ os.makedirs(save_directory, exist_ok=True)
364
+
365
+ # Save tokenizer with proper configuration for AutoTokenizer
366
+ print("Saving tokenizer for AutoTokenizer compatibility...")
367
+ self.tokenizer.save_pretrained(save_directory, **kwargs)
368
+
369
+ # Save image processor
370
+ print("Saving image processor...")
371
+ self.image_processor.save_pretrained(save_directory, **kwargs)
372
+
373
+ # Use the parent class's save_pretrained method for processor config
374
+ super().save_pretrained(save_directory, **kwargs)
375
+ print(f"NemotronParseLightProcessor saved to {save_directory}")
376
+ print(f"AutoTokenizer.from_pretrained('{save_directory}') should now work!")
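
Putting the processor and model together, a minimal end-to-end inference sketch (placeholder repo id and image path; decoding settings come from generation_config.json):

import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor

repo_id = "path/to/this-upload"  # placeholder
model = AutoModel.from_pretrained(repo_id, trust_remote_code=True, torch_dtype=torch.bfloat16).eval()
processor = AutoProcessor.from_pretrained(repo_id, trust_remote_code=True)

image = Image.open("page.png").convert("RGB")    # placeholder document image
pixel_values = processor(images=image, return_tensors="pt")["pixel_values"]

with torch.no_grad():
    output_ids = model.generate(pixel_values=pixel_values.to(model.device, model.dtype))

raw = processor.batch_decode(output_ids, skip_special_tokens=True)[0]
print(processor.post_process_generation(raw))
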
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6390e2fef454cf909f6777a786d81b3cfc404189c4c037060d0670a88d1a0312
3
+ size 3843898016
preprocessor_config.json ADDED
@@ -0,0 +1,20 @@
1
+ {
2
+ "feature_extractor_type": "NemotronParseLightImageProcessor",
3
+ "image_processor_type": "NemotronParseLightImageProcessor",
4
+ "processor_class": "NemotronParseLightProcessor",
5
+ "do_normalize": false,
6
+ "do_rescale": true,
7
+ "rescale_factor": 0.00392156862745098,
8
+ "size": {
9
+ "height": 2048,
10
+ "width": 1664,
11
+ "longest_edge": [
12
+ 2048,
13
+ 1664
14
+ ]
15
+ },
16
+ "final_size": [
17
+ 2048,
18
+ 1664
19
+ ]
20
+ }
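
Note that preprocessing only resizes with the aspect ratio preserved (longest-edge caps of 2048/1664), pads to final_size with white, and rescales to [0, 1]; channel normalization is left to the RADIO encoder itself. A small shape check, assuming the folder is available locally:

from PIL import Image
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained("path/to/this-upload", trust_remote_code=True)  # placeholder
batch = image_processor(Image.new("RGB", (1200, 900), "white"), return_tensors="pt")
print(batch["pixel_values"].shape)  # torch.Size([1, 3, 2048, 1664])
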
special_tokens_map.json ADDED
@@ -0,0 +1,39 @@
1
+ {
2
+ "additional_special_tokens": [
3
+ {
4
+ "content": "<predict_classes>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false
9
+ }
10
+ ],
11
+ "bos_token": {
12
+ "content": "<s>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false
17
+ },
18
+ "eos_token": {
19
+ "content": "</s>",
20
+ "lstrip": false,
21
+ "normalized": false,
22
+ "rstrip": false,
23
+ "single_word": false
24
+ },
25
+ "pad_token": {
26
+ "content": "<pad>",
27
+ "lstrip": false,
28
+ "normalized": false,
29
+ "rstrip": false,
30
+ "single_word": false
31
+ },
32
+ "unk_token": {
33
+ "content": "<unk>",
34
+ "lstrip": false,
35
+ "normalized": false,
36
+ "rstrip": false,
37
+ "single_word": false
38
+ }
39
+ }
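
The ids behind these strings live in the bundled tokenizer files; a quick way to inspect them once the folder is downloaded (placeholder path again; the 0/2/1 values are what config.json declares for bos/eos/pad):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/this-upload")  # placeholder
print(tokenizer.bos_token_id, tokenizer.eos_token_id, tokenizer.pad_token_id)  # expected 0 2 1
print(tokenizer.convert_tokens_to_ids("<predict_classes>"))                    # the extra task token above
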
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
The diff for this file is too large to render. See raw diff