<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>Voxtral ASR Fine-tuning - Architecture Diagrams</title>
    <script type="module">
        import mermaid from 'https://cdn.jsdelivr.net/npm/mermaid@10.6.1/dist/mermaid.esm.min.mjs';
        mermaid.initialize({
            startOnLoad: true,
            theme: 'base',
            themeVariables: {
                primaryColor: '#e3f2fd',
                primaryTextColor: '#1976d2',
                primaryBorderColor: '#01579b',
                lineColor: '#424242',
                secondaryColor: '#fff3e0',
                tertiaryColor: '#fce4ec',
                background: '#ffffff',
                mainBkg: '#ffffff',
                secondBkg: '#f5f5f5',
                textColor: '#333333'
            },
            flowchart: {
                useMaxWidth: true,
                htmlLabels: true,
                curve: 'basis'
            },
            sequence: {
                useMaxWidth: true
            }
        });
    </script>
    <style>
        body {
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.6;
            color: #333;
            max-width: 1200px;
            margin: 0 auto;
            padding: 20px;
            background: #f8f9fa;
        }

        .header {
            text-align: center;
            margin-bottom: 40px;
            padding: 20px;
            background: white;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }

        .diagram-container {
            background: white;
            margin: 20px 0;
            padding: 20px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
        }

        .diagram-title {
            font-size: 1.5em;
            font-weight: bold;
            margin-bottom: 15px;
            color: #1976d2;
            border-bottom: 2px solid #e3f2fd;
            padding-bottom: 10px;
        }

        .diagram-description {
            margin-bottom: 20px;
            color: #666;
            font-style: italic;
        }

        .navigation {
            position: fixed;
            top: 20px;
            right: 20px;
            background: white;
            padding: 15px;
            border-radius: 8px;
            box-shadow: 0 2px 4px rgba(0,0,0,0.1);
            max-width: 200px;
        }

        .nav-link {
            display: block;
            padding: 8px 0;
            color: #1976d2;
            text-decoration: none;
            border-bottom: 1px solid #eee;
        }

        .nav-link:hover {
            color: #01579b;
            text-decoration: underline;
        }

        .nav-link:last-child {
            border-bottom: none;
        }

        .code-toggle {
            background: #f5f5f5;
            border: 1px solid #ddd;
            padding: 10px;
            margin: 10px 0;
            border-radius: 4px;
            cursor: pointer;
            font-size: 0.9em;
        }

        .mermaid-code {
            display: none;
            background: #f8f9fa;
            border: 1px solid #dee2e6;
            border-radius: 4px;
            padding: 15px;
            margin: 10px 0;
            font-family: 'Courier New', monospace;
            font-size: 0.85em;
            white-space: pre-wrap;
            overflow-x: auto;
        }

        .download-btn {
            background: #1976d2;
            color: white;
            border: none;
            padding: 8px 16px;
            border-radius: 4px;
            cursor: pointer;
            font-size: 0.9em;
            margin: 10px 5px 10px 0;
        }

        .download-btn:hover {
            background: #01579b;
        }

        @media print {
            .navigation, .code-toggle, .download-btn {
                display: none;
            }
            .diagram-container {
                break-inside: avoid;
                margin: 10px 0;
            }
        }
    </style>
</head>
<body>
    <div class="header">
        <h1>🎯 Voxtral ASR Fine-tuning</h1>
        <h2>Architecture & Workflow Diagrams</h2>
        <p>Interactive documentation with Mermaid diagrams</p>
    </div>

    <nav class="navigation">
        <strong>Quick Navigation</strong>
        <a href="#overview" class="nav-link">Overview</a>
        <a href="#architecture" class="nav-link">Architecture</a>
        <a href="#interface" class="nav-link">Interface Workflow</a>
        <a href="#training" class="nav-link">Training Pipeline</a>
        <a href="#deployment" class="nav-link">Deployment Pipeline</a>
        <a href="#dataflow" class="nav-link">Data Flow</a>
    </nav>

    <div id="overview" class="diagram-container">
        <div class="diagram-title">πŸ“‹ Documentation Overview</div>
        <div class="diagram-description">
            High-level overview of the Voxtral ASR Fine-tuning application and its documentation structure.
        </div>
        <div class="mermaid">
graph TD
    START(["Voxtral ASR Fine-tuning App"]) --> OVERVIEW{Choose Documentation}

    OVERVIEW --> ARCH["Architecture Overview"]
    OVERVIEW --> WORKFLOW["Interface Workflow"]
    OVERVIEW --> TRAINING["Training Pipeline"]
    OVERVIEW --> DEPLOYMENT["Deployment Pipeline"]
    OVERVIEW --> DATAFLOW["Data Flow"]

    ARCH --> ARCH_DIAG["High-level Architecture<br/>System Components & Layers"]
    WORKFLOW --> WORKFLOW_DIAG["User Journey<br/>Recording → Training → Demo"]
    TRAINING --> TRAINING_DIAG["Training Scripts<br/>Data → Model → Results"]
    DEPLOYMENT --> DEPLOYMENT_DIAG["Publishing & Demo<br/>Model → Hub → Space"]
    DATAFLOW --> DATAFLOW_DIAG["Complete Data Journey<br/>Input → Processing → Output"]

    subgraph "Core Components"
        INTERFACE["interface.py<br/>Gradio Web UI"]
        TRAIN_SCRIPTS["scripts/train*.py<br/>Training Scripts"]
        DEPLOY_SCRIPT["scripts/deploy_demo_space.py<br/>Demo Deployment"]
        PUSH_SCRIPT["scripts/push_to_huggingface.py<br/>Model Publishing"]
    end

    subgraph "Key Data Formats"
        JSONL["JSONL Dataset<br/>{'audio_path': '...', 'text': '...'}"]
        HFDATA["HF Hub Models<br/>username/model-name"]
        SPACES["HF Spaces<br/>Interactive Demos"]
    end

    INTERFACE --> WORKFLOW
    TRAIN_SCRIPTS --> TRAINING
    DEPLOY_SCRIPT --> DEPLOYMENT
    PUSH_SCRIPT --> DEPLOYMENT

    JSONL --> DATAFLOW
    HFDATA --> DEPLOYMENT
    SPACES --> DEPLOYMENT

    classDef entry fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
    classDef category fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    classDef diagram fill:#e8f5e8,stroke:#388e3c,stroke-width:2px
    classDef component fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
    classDef data fill:#e1f5fe,stroke:#0277bd,stroke-width:2px

    class START entry
    class OVERVIEW,ARCH,WORKFLOW,TRAINING,DEPLOYMENT,DATAFLOW category
    class ARCH_DIAG,WORKFLOW_DIAG,TRAINING_DIAG,DEPLOYMENT_DIAG,DATAFLOW_DIAG diagram
    class INTERFACE,TRAIN_SCRIPTS,DEPLOY_SCRIPT,PUSH_SCRIPT component
    class JSONL,HFDATA,SPACES data
        </div>
    </div>

    <div id="architecture" class="diagram-container">
        <div class="diagram-title">System Architecture</div>
        <div class="diagram-description">
            High-level architecture showing the main components and their relationships in the Voxtral ASR Fine-tuning application.
        </div>
        <div class="mermaid">
graph TB
    subgraph "User Interface"
        UI["Gradio Web Interface<br/>interface.py"]
        REC["Audio Recording<br/>Microphone Input"]
        UP["File Upload<br/>WAV/FLAC files"]
    end

    subgraph "Data Processing"
        DP["Data Processing<br/>Audio resampling<br/>JSONL creation"]
        DS["Dataset Management<br/>NVIDIA Granary<br/>Local datasets"]
    end

    subgraph "Training Pipeline"
        TF["Full Fine-tuning<br/>scripts/train.py"]
        TL["LoRA Fine-tuning<br/>scripts/train_lora.py"]
        TI["Trackio Integration<br/>Experiment Tracking"]
    end

    subgraph "Model Management"
        MM["Model Management<br/>Hugging Face Hub<br/>Local storage"]
        MC["Model Card Generation<br/>scripts/generate_model_card.py"]
    end

    subgraph "Deployment &amp; Demo"
        DEP["Demo Space Deployment<br/>scripts/deploy_demo_space.py"]
        HF["HF Spaces<br/>Interactive Demo"]
    end

    subgraph "External Services"
        HFH["Hugging Face Hub<br/>Models & Datasets"]
        GRAN["NVIDIA Granary<br/>Multilingual ASR Dataset"]
        TRACK["Trackio Spaces<br/>Experiment Tracking"]
    end

    UI --> DP
    REC --> DP
    UP --> DP
    DP --> DS

    DS --> TF
    DS --> TL
    TF --> TI
    TL --> TI

    TF --> MM
    TL --> MM
    MM --> MC

    MM --> DEP
    DEP --> HF

    DS -.-> HFH
    MM -.-> HFH
    TI -.-> TRACK
    DS -.-> GRAN

    classDef interface fill:#e1f5fe,stroke:#01579b,stroke-width:2px
    classDef processing fill:#f3e5f5,stroke:#4a148c,stroke-width:2px
    classDef training fill:#e8f5e8,stroke:#1b5e20,stroke-width:2px
    classDef management fill:#fff3e0,stroke:#e65100,stroke-width:2px
    classDef deployment fill:#fce4ec,stroke:#880e4f,stroke-width:2px
    classDef external fill:#f5f5f5,stroke:#424242,stroke-width:2px

    class UI,REC,UP interface
    class DP,DS processing
    class TF,TL,TI training
    class MM,MC management
    class DEP,HF deployment
    class HFH,GRAN,TRACK external
        </div>
    </div>

    <div id="interface" class="diagram-container">
        <div class="diagram-title">Interface Workflow</div>
        <div class="diagram-description">
            Complete user journey through the Voxtral ASR Fine-tuning interface, from language selection to demo deployment.
        </div>
        <div class="mermaid">
flowchart TD
    START(["User Opens Interface"]) --> LANG["Language Selection<br/>Choose from 25+ languages"]
    LANG --> PHRASES["Load Phrases<br/>From NVIDIA Granary"]
    PHRASES --> RECORD["Recording Interface<br/>Display phrases + audio recording"]

    RECORD --> |User Records| PROCESS_REC["Process Recordings<br/>Save WAV files + transcripts"]
    RECORD --> |Upload Files| PROCESS_UPLOAD["Process Uploads<br/>Handle existing files + transcripts"]

    PROCESS_REC --> JSONL["Create JSONL Dataset<br/>{'audio_path': '...', 'text': '...'}"]
    PROCESS_UPLOAD --> JSONL

    JSONL --> CONFIG["Training Configuration<br/>Model, LoRA/full, hyperparameters"]
    CONFIG --> TRAIN["Training Process<br/>Execute train.py or train_lora.py"]

    TRAIN --> PUSH["Push to Hub<br/>Model + metadata to HF Hub"]
    TRAIN --> CARD["Generate Model Card<br/>Automated documentation"]
    PUSH --> DEPLOY["Deploy Demo Space<br/>Interactive demo on HF Spaces"]

    DEPLOY --> END(["Demo Ready<br/>Interactive ASR Demo"])

    PUSH -.-> END
    CARD -.-> END

    classDef start fill:#e3f2fd,stroke:#1976d2,stroke-width:3px
    classDef process fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
    classDef decision fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    classDef terminal fill:#e8f5e8,stroke:#388e3c,stroke-width:3px

    class START start
    class END terminal
    class LANG,PHRASES,RECORD,PROCESS_REC,PROCESS_UPLOAD,JSONL,CONFIG,TRAIN,PUSH,CARD,DEPLOY process
        </div>
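        <div class="diagram-description">
            The sketch below illustrates the "Create JSONL Dataset" step in Python: each recorded or uploaded clip is paired with its transcript and appended to data.jsonl. The helper name and the datasets/voxtral_user/ layout are assumptions taken from the diagrams, not the app's exact API.
        </div>
        <pre style="background: #f8f9fa; border: 1px solid #dee2e6; border-radius: 4px; padding: 15px; font-size: 0.85em; overflow-x: auto;">
# Illustrative sketch only: write (wav_path, transcript) pairs to the JSONL
# format consumed by the training scripts. Paths follow the Data Flow diagram
# (datasets/voxtral_user/); this function is not part of interface.py itself.
import json
from pathlib import Path

def write_jsonl_dataset(pairs, out_dir="datasets/voxtral_user"):
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    with open(out / "data.jsonl", "w", encoding="utf-8") as f:
        for wav_path, text in pairs:
            row = {"audio_path": str(wav_path), "text": text}
            f.write(json.dumps(row, ensure_ascii=False) + "\n")

write_jsonl_dataset([
    ("datasets/voxtral_user/wavs/clip_000.wav", "example transcript"),
])
        </pre>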
    </div>

    <div id="training" class="diagram-container">
        <div class="diagram-title">Training Pipeline</div>
        <div class="diagram-description">
            Detailed training pipeline showing how data flows through training scripts and supporting infrastructure.
        </div>
        <div class="mermaid">
graph TB
    subgraph "Data Sources"
        JSONL["JSONL Dataset<br/>{'audio_path': '...', 'text': '...'}"]
        GRANARY["NVIDIA Granary Dataset<br/>Multilingual ASR Data"]
        HFDATA["HF Hub Datasets<br/>Community Datasets"]
    end

    subgraph "Data Processing"
        LOADER["Dataset Loader<br/>_load_jsonl_dataset()"]
        CASTER["Audio Casting<br/>16kHz resampling"]
        COLLATOR["VoxtralDataCollator<br/>Audio + Text Processing"]
    end

    subgraph "Training Scripts"
        TRAIN_FULL["Full Fine-tuning<br/>scripts/train.py"]
        TRAIN_LORA["LoRA Fine-tuning<br/>scripts/train_lora.py"]

        subgraph "Training Components"
            MODEL_INIT["Model Initialization<br/>VoxtralForConditionalGeneration"]
            LORA_CONFIG["LoRA Configuration<br/>LoraConfig + get_peft_model"]
            PROCESSOR_INIT["Processor Initialization<br/>VoxtralProcessor"]
        end
    end

    subgraph "Training Infrastructure"
        TRACKIO_INIT["Trackio Integration<br/>Experiment Tracking"]
        HF_TRAINER["Hugging Face Trainer<br/>TrainingArguments + Trainer"]
        TORCH_DEVICE["Torch Device Setup<br/>GPU/CPU Detection"]
    end

    subgraph "Training Process"
        FORWARD_PASS["Forward Pass<br/>Audio Processing + Generation"]
        LOSS_CALC["Loss Calculation<br/>Masked Language Modeling"]
        BACKWARD_PASS["Backward Pass<br/>Gradient Computation"]
        OPTIMIZER_STEP["Optimizer Step<br/>Parameter Updates"]
        LOGGING["Metrics Logging<br/>Loss, Perplexity, etc."]
    end

    subgraph "Model Management"
        CHECKPOINT_SAVING["Checkpoint Saving<br/>Model snapshots"]
        MODEL_SAVING["Final Model Saving<br/>Processor + Model"]
        LOCAL_STORAGE["Local Storage<br/>outputs/ directory"]
    end

    LOADER --> CASTER
    CASTER --> COLLATOR

    COLLATOR --> TRAIN_FULL
    COLLATOR --> TRAIN_LORA

    TRAIN_FULL --> MODEL_INIT
    TRAIN_LORA --> MODEL_INIT
    TRAIN_LORA --> LORA_CONFIG

    MODEL_INIT --> PROCESSOR_INIT
    LORA_CONFIG --> PROCESSOR_INIT

    PROCESSOR_INIT --> TRACKIO_INIT
    PROCESSOR_INIT --> HF_TRAINER
    PROCESSOR_INIT --> TORCH_DEVICE

    TRACKIO_INIT --> HF_TRAINER
    TORCH_DEVICE --> HF_TRAINER

    HF_TRAINER --> FORWARD_PASS
    FORWARD_PASS --> LOSS_CALC
    LOSS_CALC --> BACKWARD_PASS
    BACKWARD_PASS --> OPTIMIZER_STEP
    OPTIMIZER_STEP --> LOGGING

    LOGGING --> CHECKPOINT_SAVING
    LOGGING --> TRACKIO_INIT

    HF_TRAINER --> MODEL_SAVING
    MODEL_SAVING --> LOCAL_STORAGE

    JSONL --> LOADER
    GRANARY --> LOADER
    HFDATA --> LOADER

    classDef input fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
    classDef processing fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
    classDef training fill:#e8f5e8,stroke:#388e3c,stroke-width:2px
    classDef infrastructure fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    classDef execution fill:#fce4ec,stroke:#c2185b,stroke-width:2px
    classDef output fill:#f5f5f5,stroke:#424242,stroke-width:2px

    class JSONL,GRANARY,HFDATA input
    class LOADER,CASTER,COLLATOR processing
    class TRAIN_FULL,TRAIN_LORA,MODEL_INIT,LORA_CONFIG,PROCESSOR_INIT training
    class TRACKIO_INIT,HF_TRAINER,TORCH_DEVICE infrastructure
    class FORWARD_PASS,LOSS_CALC,BACKWARD_PASS,OPTIMIZER_STEP,LOGGING execution
    class CHECKPOINT_SAVING,MODEL_SAVING,LOCAL_STORAGE output
        </div>
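        <div class="diagram-description">
            A hedged Python sketch of the model-initialization and LoRA steps above (the scripts/train_lora.py path). The class names come from the diagram; the checkpoint id, LoRA rank, and target modules are illustrative assumptions rather than the scripts' actual defaults.
        </div>
        <pre style="background: #f8f9fa; border: 1px solid #dee2e6; border-radius: 4px; padding: 15px; font-size: 0.85em; overflow-x: auto;">
# Sketch of model init + LoRA wrapping, assuming a recent transformers with
# Voxtral support plus the peft library. Hyperparameters are placeholders.
import torch
from transformers import AutoProcessor, VoxtralForConditionalGeneration
from peft import LoraConfig, get_peft_model

model_id = "mistralai/Voxtral-Mini-3B-2507"          # assumed base checkpoint
processor = AutoProcessor.from_pretrained(model_id)   # resolves to the Voxtral processor
model = VoxtralForConditionalGeneration.from_pretrained(
    model_id,
    torch_dtype=torch.bfloat16,
    device_map="auto",                                # GPU/CPU detection
)

# LoRA fine-tuning wraps the base model with low-rank adapters; the full
# fine-tuning path (scripts/train.py) skips this block and trains all weights.
lora_cfg = LoraConfig(
    r=16, lora_alpha=32, lora_dropout=0.05,
    target_modules=["q_proj", "v_proj"],              # placeholder module names
)
model = get_peft_model(model, lora_cfg)
model.print_trainable_parameters()                    # only the adapters are trainable

# The wrapped model is then handed to TrainingArguments/Trainer together with
# the JSONL dataset and the repo's VoxtralDataCollator shown in the diagram.
        </pre>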
    </div>

    <div id="deployment" class="diagram-container">
        <div class="diagram-title">Deployment Pipeline</div>
        <div class="diagram-description">
            Model publishing and demo deployment process from trained model to live interactive demo.
        </div>
        <div class="mermaid">
graph TB
    subgraph "Inputs"
        TRAINED_MODEL["Trained Model<br/>Local directory"]
        TRAINING_CONFIG["Training Config<br/>JSON/YAML"]
        TRAINING_RESULTS["Training Results<br/>Metrics & logs"]
        MODEL_METADATA["Model Metadata<br/>Name, description, etc."]
    end

    subgraph "Model Publishing"
        PUSH_SCRIPT["push_to_huggingface.py<br/>Model Publisher"]

        subgraph "Publishing Steps"
            REPO_CREATION["Repository Creation<br/>HF Hub API"]
            FILE_UPLOAD["File Upload<br/>Model files to HF"]
            METADATA_UPLOAD["Metadata Upload<br/>Config & results"]
        end
    end

    subgraph "Model Card Generation"
        CARD_SCRIPT["generate_model_card.py<br/>Card Generator"]

        subgraph "Card Components"
            TEMPLATE_LOAD["Template Loading<br/>model_card.md"]
            VARIABLE_REPLACEMENT["Variable Replacement<br/>Config injection"]
            CONDITIONAL_PROCESSING["Conditional Sections<br/>Quantized models, etc."]
        end
    end

    subgraph "Demo Space Deployment"
        DEPLOY_SCRIPT["deploy_demo_space.py<br/>Space Deployer"]

        subgraph "Space Setup"
            SPACE_CREATION["Space Repository<br/>Create HF Space"]
            TEMPLATE_COPY["Template Copying<br/>demo_voxtral/ files"]
            ENV_INJECTION["Environment Setup<br/>Model config injection"]
            SECRET_SETUP["Secret Configuration<br/>HF_TOKEN, model vars"]
        end
    end

    subgraph "Space Building"
        BUILD_TRIGGER["Build Trigger<br/>Automatic build start"]
        DEPENDENCY_INSTALL["Dependency Installation<br/>requirements.txt"]
        MODEL_DOWNLOAD["Model Download<br/>From HF Hub"]
        APP_INITIALIZATION["App Initialization<br/>Gradio app setup"]
    end

    subgraph "Live Demo Space"
        GRADIO_INTERFACE["Gradio Interface<br/>Interactive demo"]
        MODEL_INFERENCE["Model Inference<br/>Real-time ASR"]
        USER_INTERACTION["User Interaction<br/>Audio upload/playback"]
    end

    subgraph "External Services"
        HF_HUB["Hugging Face Hub<br/>Model & Space hosting"]
        HF_SPACES["HF Spaces Platform<br/>Demo hosting"]
    end

    TRAINED_MODEL --> PUSH_SCRIPT
    TRAINING_CONFIG --> PUSH_SCRIPT
    TRAINING_RESULTS --> PUSH_SCRIPT
    MODEL_METADATA --> PUSH_SCRIPT

    PUSH_SCRIPT --> REPO_CREATION
    REPO_CREATION --> FILE_UPLOAD
    FILE_UPLOAD --> METADATA_UPLOAD

    METADATA_UPLOAD --> CARD_SCRIPT
    TRAINING_CONFIG --> CARD_SCRIPT
    TRAINING_RESULTS --> CARD_SCRIPT

    CARD_SCRIPT --> TEMPLATE_LOAD
    TEMPLATE_LOAD --> VARIABLE_REPLACEMENT
    VARIABLE_REPLACEMENT --> CONDITIONAL_PROCESSING

    CONDITIONAL_PROCESSING --> DEPLOY_SCRIPT
    METADATA_UPLOAD --> DEPLOY_SCRIPT

    DEPLOY_SCRIPT --> SPACE_CREATION
    SPACE_CREATION --> TEMPLATE_COPY
    TEMPLATE_COPY --> ENV_INJECTION
    ENV_INJECTION --> SECRET_SETUP

    SECRET_SETUP --> BUILD_TRIGGER
    BUILD_TRIGGER --> DEPENDENCY_INSTALL
    DEPENDENCY_INSTALL --> MODEL_DOWNLOAD
    MODEL_DOWNLOAD --> APP_INITIALIZATION

    APP_INITIALIZATION --> GRADIO_INTERFACE
    GRADIO_INTERFACE --> MODEL_INFERENCE
    MODEL_INFERENCE --> USER_INTERACTION

    HF_HUB --> MODEL_DOWNLOAD
    HF_SPACES --> GRADIO_INTERFACE

    classDef input fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
    classDef publishing fill:#e8f5e8,stroke:#388e3c,stroke-width:2px
    classDef generation fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    classDef deployment fill:#fce4ec,stroke:#c2185b,stroke-width:2px
    classDef building fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
    classDef demo fill:#e1f5fe,stroke:#0277bd,stroke-width:2px
    classDef external fill:#f5f5f5,stroke:#424242,stroke-width:2px

    class TRAINED_MODEL,TRAINING_CONFIG,TRAINING_RESULTS,MODEL_METADATA input
    class PUSH_SCRIPT,REPO_CREATION,FILE_UPLOAD,METADATA_UPLOAD publishing
    class CARD_SCRIPT,TEMPLATE_LOAD,VARIABLE_REPLACEMENT,CONDITIONAL_PROCESSING generation
    class DEPLOY_SCRIPT,SPACE_CREATION,TEMPLATE_COPY,ENV_INJECTION,SECRET_SETUP deployment
    class BUILD_TRIGGER,DEPENDENCY_INSTALL,MODEL_DOWNLOAD,APP_INITIALIZATION building
    class GRADIO_INTERFACE,MODEL_INFERENCE,USER_INTERACTION demo
    class HF_HUB,HF_SPACES external
        </div>
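        <div class="diagram-description">
            A minimal sketch of the publishing and Space-setup steps using the huggingface_hub client. The repository ids and the token value are placeholders; the actual scripts (push_to_huggingface.py, deploy_demo_space.py) add model cards, template copying, and error handling on top of these calls.
        </div>
        <pre style="background: #f8f9fa; border: 1px solid #dee2e6; border-radius: 4px; padding: 15px; font-size: 0.85em; overflow-x: auto;">
# Hedged sketch: create the model repo, upload the trained output, then
# scaffold the demo Space and inject its configuration. Ids are placeholders.
from huggingface_hub import HfApi

api = HfApi()                                        # reads HF_TOKEN from the environment
model_repo = "username/voxtral-asr-finetuned"        # placeholder model repo id
api.create_repo(model_repo, repo_type="model", exist_ok=True)
api.upload_folder(folder_path="outputs/voxtral-lora",
                  repo_id=model_repo, repo_type="model")

space_repo = "username/voxtral-asr-demo"             # placeholder Space id
api.create_repo(space_repo, repo_type="space", space_sdk="gradio", exist_ok=True)
api.upload_folder(folder_path="demo_voxtral",        # template files from the diagram
                  repo_id=space_repo, repo_type="space")
api.add_space_variable(space_repo, "HF_MODEL_ID", model_repo)   # model config injection
api.add_space_secret(space_repo, "HF_TOKEN", "hf_xxx")          # placeholder secret value
        </pre>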
    </div>

    <div id="dataflow" class="diagram-container">
        <div class="diagram-title">Data Flow</div>
        <div class="diagram-description">
            Complete data journey through the Voxtral ASR Fine-tuning application from user input to deployed demo.
        </div>
        <div class="mermaid">
flowchart TD
    subgraph "User Input"
        MIC["Microphone Recording<br/>Raw audio + timestamps"]
        FILE["File Upload<br/>WAV/FLAC files"]
        TEXT["Manual Transcripts<br/>Text input"]
        LANG["Language Selection<br/>25+ languages"]
    end

    subgraph "Data Processing"
        AUDIO_PROC["Audio Processing<br/>Resampling to 16kHz<br/>Format conversion"]
        TEXT_PROC["Text Processing<br/>Transcript validation<br/>Cleaning & formatting"]
        JSONL_CONV["JSONL Conversion<br/>{'audio_path': '...', 'text': '...'}"]
    end

    subgraph "Dataset Storage"
        LOCAL_DS["Local Dataset<br/>datasets/voxtral_user/<br/>data.jsonl + wavs/"]
        HF_DS["HF Hub Dataset<br/>username/dataset-name<br/>Public sharing"]
    end

    subgraph "Training Data Pipeline"
        DS_LOADER["Dataset Loader<br/>_load_jsonl_dataset()<br/>or load_dataset()"]
        AUDIO_CAST["Audio Casting<br/>Audio(sampling_rate=16000)"]
        TRAIN_SPLIT["Train Split<br/>train_dataset"]
        EVAL_SPLIT["Eval Split<br/>eval_dataset"]
    end

    subgraph "Model Training"
        COLLATOR["VoxtralDataCollator<br/>Audio + Text batching<br/>Prompt construction"]
        FORWARD["Forward Pass<br/>Audio β†’ Features β†’ Text"]
        LOSS["Loss Calculation<br/>Masked LM loss"]
        BACKWARD["Backward Pass<br/>Gradient computation"]
        OPTIMIZE["Parameter Updates<br/>LoRA or full fine-tuning"]
    end

    subgraph "Training Outputs"
        MODEL_FILES["Model Files<br/>model.safetensors<br/>config.json<br/>tokenizer.json"]
        TRAINING_LOGS["Training Logs<br/>train_results.json<br/>training_config.json<br/>loss curves"]
        CHECKPOINTS["Checkpoints<br/>Intermediate models<br/>best model tracking"]
    end

    subgraph "Publishing Pipeline"
        HF_REPO["HF Repository<br/>username/model-name<br/>Model hosting"]
        MODEL_CARD["Model Card<br/>README.md<br/>Training details<br/>Usage examples"]
        METADATA["Training Metadata<br/>Config + results<br/>Performance metrics"]
    end

    subgraph "Demo Deployment"
        SPACE_REPO["HF Space Repository<br/>username/model-name-demo<br/>Demo hosting"]
        DEMO_APP["Demo Application<br/>Gradio interface<br/>Real-time inference"]
        ENV_VARS["Environment Config<br/>HF_MODEL_ID<br/>MODEL_NAME<br/>secrets"]
    end

    MIC --> AUDIO_PROC
    FILE --> AUDIO_PROC
    TEXT --> TEXT_PROC
    LANG --> TEXT_PROC

    AUDIO_PROC --> JSONL_CONV
    TEXT_PROC --> JSONL_CONV

    JSONL_CONV --> LOCAL_DS
    LOCAL_DS --> HF_DS

    LOCAL_DS --> DS_LOADER
    HF_DS --> DS_LOADER

    DS_LOADER --> AUDIO_CAST
    AUDIO_CAST --> TRAIN_SPLIT
    AUDIO_CAST --> EVAL_SPLIT

    TRAIN_SPLIT --> COLLATOR
    EVAL_SPLIT --> COLLATOR

    COLLATOR --> FORWARD
    FORWARD --> LOSS
    LOSS --> BACKWARD
    BACKWARD --> OPTIMIZE

    OPTIMIZE --> MODEL_FILES
    OPTIMIZE --> TRAINING_LOGS
    OPTIMIZE --> CHECKPOINTS

    MODEL_FILES --> HF_REPO
    TRAINING_LOGS --> HF_REPO
    CHECKPOINTS --> HF_REPO

    HF_REPO --> MODEL_CARD
    TRAINING_LOGS --> MODEL_CARD

    MODEL_CARD --> SPACE_REPO
    HF_REPO --> SPACE_REPO
    ENV_VARS --> SPACE_REPO

    SPACE_REPO --> DEMO_APP

    classDef input fill:#e3f2fd,stroke:#1976d2,stroke-width:2px
    classDef processing fill:#f3e5f5,stroke:#7b1fa2,stroke-width:2px
    classDef storage fill:#fff3e0,stroke:#f57c00,stroke-width:2px
    classDef training fill:#e8f5e8,stroke:#388e3c,stroke-width:2px
    classDef output fill:#fce4ec,stroke:#c2185b,stroke-width:2px
    classDef publishing fill:#e1f5fe,stroke:#0277bd,stroke-width:2px
    classDef deployment fill:#f5f5f5,stroke:#424242,stroke-width:2px

    class MIC,FILE,TEXT,LANG input
    class AUDIO_PROC,TEXT_PROC,JSONL_CONV processing
    class LOCAL_DS,HF_DS storage
    class DS_LOADER,AUDIO_CAST,TRAIN_SPLIT,EVAL_SPLIT,COLLATOR,FORWARD,LOSS,BACKWARD,OPTIMIZE training
    class MODEL_FILES,TRAINING_LOGS,CHECKPOINTS output
    class HF_REPO,MODEL_CARD,METADATA publishing
    class SPACE_REPO,DEMO_APP,ENV_VARS deployment
        </div>
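        <div class="diagram-description">
            The sketch below covers the "Training Data Pipeline" stage: load the JSONL dataset, cast the audio column to 16 kHz, and split it into train/eval sets. Column names match the JSONL format in the diagram; the split size and seed are arbitrary example values.
        </div>
        <pre style="background: #f8f9fa; border: 1px solid #dee2e6; border-radius: 4px; padding: 15px; font-size: 0.85em; overflow-x: auto;">
# Sketch of dataset loading + audio casting with the datasets library.
from datasets import load_dataset, Audio

ds = load_dataset("json", data_files="datasets/voxtral_user/data.jsonl", split="train")
ds = ds.cast_column("audio_path", Audio(sampling_rate=16000))   # decode + resample to 16 kHz

splits = ds.train_test_split(test_size=0.1, seed=42)            # example split
train_dataset, eval_dataset = splits["train"], splits["test"]

sample = train_dataset[0]
print(sample["text"])
print(sample["audio_path"]["sampling_rate"], len(sample["audio_path"]["array"]))
        </pre>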
    </div>

    <script>
        // Toggle mermaid code visibility
        function toggleCode(diagramId) {
            const codeBlock = document.querySelector(`#${diagramId} .mermaid-code`);
            if (codeBlock.style.display === 'none' || codeBlock.style.display === '') {
                codeBlock.style.display = 'block';
            } else {
                codeBlock.style.display = 'none';
            }
        }

        // Add toggle buttons to each diagram
        document.addEventListener('DOMContentLoaded', function() {
            const diagrams = document.querySelectorAll('.diagram-container');
            diagrams.forEach((diagram, index) => {
                const diagramId = diagram.id;
                const mermaidDiv = diagram.querySelector('.mermaid');

                if (mermaidDiv) {
                    // Create toggle button
                    const toggleBtn = document.createElement('button');
                    toggleBtn.className = 'code-toggle';
                    toggleBtn.textContent = '🔍 Show Mermaid Code';
                    toggleBtn.onclick = () => toggleCode(diagramId);

                    // Create code block
                    const codeBlock = document.createElement('pre');
                    codeBlock.className = 'mermaid-code';
                    codeBlock.textContent = mermaidDiv.textContent.trim();

                    // Insert elements
                    mermaidDiv.parentNode.insertBefore(toggleBtn, mermaidDiv);
                    mermaidDiv.parentNode.insertBefore(codeBlock, mermaidDiv.nextSibling);
                }
            });
        });

        // Print functionality
        function printDiagrams() {
            window.print();
        }
    </script>
</body>
</html>