File size: 54,361 Bytes
d81440f
b180c39
632df2f
3ecb35b
 
e86b23a
632df2f
05b4419
3ecb35b
7324283
 
 
 
 
05b4419
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ecb35b
 
 
05b4419
 
 
 
 
 
3ecb35b
 
 
05b4419
7324283
e86b23a
502ec94
 
 
 
e86b23a
502ec94
05b4419
 
e86b23a
502ec94
e86b23a
 
 
 
 
 
3ecb35b
 
 
 
 
502ec94
3ecb35b
e86b23a
502ec94
3ecb35b
 
05b4419
e86b23a
502ec94
3ecb35b
 
05b4419
e86b23a
502ec94
3ecb35b
 
05b4419
e86b23a
502ec94
e86b23a
 
 
 
 
 
 
 
 
 
 
 
 
05b4419
b713501
3ecb35b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502ec94
3ecb35b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05b4419
 
3ecb35b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05b4419
3ecb35b
 
 
 
 
 
05b4419
3ecb35b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05b4419
3ecb35b
 
 
 
 
502ec94
3ecb35b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05b4419
3ecb35b
 
 
 
 
 
 
 
 
 
 
 
 
05b4419
502ec94
05b4419
 
3ecb35b
 
 
 
05b4419
3ecb35b
 
 
 
 
 
 
 
 
 
 
 
502ec94
3ecb35b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05b4419
3ecb35b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
05b4419
3ecb35b
05b4419
 
3ecb35b
05b4419
3ecb35b
05b4419
3ecb35b
05b4419
3ecb35b
05b4419
3ecb35b
05b4419
3ecb35b
 
 
05b4419
502ec94
05b4419
 
3ecb35b
b713501
7f130dd
b713501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c22dbae
b713501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
502ec94
b713501
 
 
 
 
 
 
 
 
 
 
 
 
 
502ec94
b713501
3ecb35b
 
 
b713501
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
3ecb35b
b713501
 
 
 
3ecb35b
b713501
3ecb35b
240c11f
05b4419
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
import json
import re
from utils.logger import log
import jieba
from typing import List, Tuple
import copy
class InfoExtractor:
    def __init__(self):

        self.extraction_schema = {
            "destination": {"type": dict, "fields": {"name": str, "country": str}},
            "duration": {"type": dict, "fields": {"days": int, "description": str}},
            "budget": {"type": dict, "fields": {"type": str, "amount": int, "currency": str, "description": str}}
        }
        
        # 欧洲城市和国家的完整映射关系(聚焦欧洲)
        self.european_cities = {
            # === 西欧 ===
            # 法国
            "巴黎": "法国", "里昂": "法国", "马赛": "法国", "尼斯": "法国", "戛纳": "法国",
            "图卢兹": "法国", "南特": "法国", "斯特拉斯堡": "法国", "蒙彼利埃": "法国", "波尔多": "法国",
            "里尔": "法国", "雷恩": "法国", "兰斯": "法国", "勒阿弗尔": "法国", "圣埃蒂安": "法国",
            "土伦": "法国", "阿维尼翁": "法国", "凡尔赛": "法国", "枫丹白露": "法国", "第戎": "法国",
            "昂热": "法国", "贝桑松": "法国", "佩皮尼昂": "法国", "卢尔德": "法国", "沙特尔": "法国",
            
            # 德国
            "柏林": "德国", "慕尼黑": "德国", "汉堡": "德国", "科隆": "德国", "法兰克福": "德国",
            "斯图加特": "德国", "杜塞尔多夫": "德国", "多特蒙德": "德国", "埃森": "德国", "莱比锡": "德国",
            "不来梅": "德国", "德累斯顿": "德国", "汉诺威": "德国", "纽伦堡": "德国", "杜伊斯堡": "德国",
            "波鸿": "德国", "乌珀塔尔": "德国", "比勒费尔德": "德国", "波恩": "德国", "明斯特": "德国",
            "卡尔斯鲁厄": "德国", "曼海姆": "德国", "奥格斯堡": "德国", "威斯巴登": "德国", "盖尔森基兴": "德国",
            "门兴格拉德巴赫": "德国", "布伦瑞克": "德国", "基尔": "德国", "亚琛": "德国", "哈雷": "德国",
            "马格德堡": "德国", "弗莱堡": "德国", "克里菲尔德": "德国", "吕贝克": "德国", "奥伯豪森": "德国",
            "埃尔福特": "德国", "罗斯托克": "德国", "凯泽斯劳滕": "德国", "卡塞尔": "德国", "哈根": "德国",
            "波茨坦": "德国", "萨尔布吕肯": "德国", "路德维希港": "德国", "奥尔登堡": "德国", "莱沃库森": "德国",
            "奥斯纳布吕克": "德国", "索林根": "德国", "海德堡": "德国", "达姆施塔特": "德国", "哈姆": "德国",
            "维尔茨堡": "德国", "雷克林豪森": "德国", "沃尔夫斯堡": "德国", "格廷根": "德国", "科特布斯": "德国",
            "希尔德斯海姆": "德国", "埃朗根": "德国", "特里尔": "德国", "耶拿": "德国", "康斯坦茨": "德国",
            "新天鹅堡": "德国", "罗滕堡": "德国", "科布伦茨": "德国", "班贝格": "德国", "拜罗伊特": "德国",
            
            # 英国
            "伦敦": "英国", "伯明翰": "英国", "曼彻斯特": "英国", "格拉斯哥": "英国", "利物浦": "英国",
            "利兹": "英国", "谢菲尔德": "英国", "爱丁堡": "英国", "布里斯托": "英国", "莱斯特": "英国",
            "考文垂": "英国", "布拉德福德": "英国", "贝尔法斯特": "英国", "卡迪夫": "英国", "诺丁汉": "英国",
            "金斯顿": "英国", "纽卡斯尔": "英国", "普利茅斯": "英国", "斯托克": "英国", "南安普顿": "英国",
            "雷丁": "英国", "德比": "英国", "约克": "英国", "牛津": "英国", "剑桥": "英国",
            "巴斯": "英国", "温莎": "英国", "坎特伯雷": "英国", "斯特拉特福": "英国", "湖区": "英国",
            "斯凯岛": "英国", "爱丁堡": "英国", "格拉斯哥": "英国", "史德灵": "英国", "珀斯": "英国",
            "因弗内斯": "英国", "阿伯丁": "英国", "邓迪": "英国", "法夫": "英国", "奥班": "英国",
            
            # 荷兰
            "阿姆斯特丹": "荷兰", "鹿特丹": "荷兰", "海牙": "荷兰", "乌得勒支": "荷兰", "埃因霍温": "荷兰",
            "蒂尔堡": "荷兰", "格罗宁根": "荷兰", "阿尔梅勒": "荷兰", "布雷达": "荷兰", "奈梅亨": "荷兰",
            "阿珀尔多伦": "荷兰", "哈勒姆": "荷兰", "阿纳姆": "荷兰", "恩斯赫德": "荷兰", "阿默斯福特": "荷兰",
            "赞丹": "荷兰", "海牙": "荷兰", "阿尔克马尔": "荷兰", "马斯特里赫特": "荷兰", "莱顿": "荷兰",
            "代尔夫特": "荷兰", "多德雷赫特": "荷兰", "豪达": "荷兰", "羊角村": "荷兰", "马尔肯": "荷兰",
            
            # 比利时
            "布鲁塞尔": "比利时", "安特卫普": "比利时", "根特": "比利时", "沙勒罗瓦": "比利时", "列日": "比利时",
            "布吕赫": "比利时", "那慕尔": "比利时", "蒙斯": "比利时", "阿尔斯特": "比利时", "科特赖克": "比利时",
            "哈瑟尔特": "比利时", "圣尼古拉": "比利时", "奥斯坦德": "比利时", "梅赫伦": "比利时", "鲁汶": "比利时",
            
            # 卢森堡
            "卢森堡市": "卢森堡", "埃施": "卢森堡", "迪费当日": "卢森堡", "杜德朗日": "卢森堡",
            
            # === 南欧 ===
            # 意大利
            "罗马": "意大利", "米兰": "意大利", "威尼斯": "意大利", "佛罗伦萨": "意大利", "那不勒斯": "意大利",
            "都灵": "意大利", "帕勒莫": "意大利", "热那亚": "意大利", "博洛尼亚": "意大利", "巴里": "意大利",
            "卡塔尼亚": "意大利", "佛罗伦萨": "意大利", "韦罗纳": "意大利", "威尼斯": "意大利", "墨西拿": "意大利",
            "帕多瓦": "意大利", "的里雅斯特": "意大利", "塔兰托": "意大利", "布雷西亚": "意大利", "摩德纳": "意大利",
            "雷焦卡拉布里亚": "意大利", "普拉托": "意大利", "卡利亚里": "意大利", "帕尔马": "意大利", "佩鲁贾": "意大利",
            "利沃诺": "意大利", "雷焦艾米利亚": "意大利", "佛嘉": "意大利", "萨莱诺": "意大利", "拉温纳": "意大利",
            "里米尼": "意大利", "拉斯佩齐亚": "意大利", "萨萨里": "意大利", "蒙扎": "意大利", "贝加莫": "意大利",
            "比萨": "意大利", "维琴察": "意大利", "三月十五日": "意大利", "博尔扎诺": "意大利", "安德里亚": "意大利",
            "阿雷佐": "意大利", "蒂沃利": "意大利", "阿西西": "意大利", "锡耶纳": "意大利", "五渔村": "意大利",
            "马泰拉": "意大利", "庞贝": "意大利", "卡普里岛": "意大利", "阿马尔菲": "意大利", "科莫": "意大利",
            
            # 西班牙
            "马德里": "西班牙", "巴塞罗那": "西班牙", "瓦伦西亚": "西班牙", "塞维利亚": "西班牙", "萨拉戈萨": "西班牙",
            "马拉加": "西班牙", "穆尔西亚": "西班牙", "帕尔马": "西班牙", "拉斯帕尔马斯": "西班牙", "毕尔巴鄂": "西班牙",
            "阿利坎特": "西班牙", "科尔多瓦": "西班牙", "巴利亚多利德": "西班牙", "维戈": "西班牙", "希洪": "西班牙",
            "莱昂": "西班牙", "拉科鲁尼亚": "西班牙", "埃尔切": "西班牙", "奥维耶多": "西班牙", "圣塞巴斯蒂安": "西班牙",
            "桑坦德": "西班牙", "卡斯特利翁": "西班牙", "洛格罗尼奥": "西班牙", "巴达霍斯": "西班牙", "萨拉曼卡": "西班牙",
            "韦尔瓦": "西班牙", "阿尔梅里亚": "西班牙", "卡迪斯": "西班牙", "格拉纳达": "西班牙", "托莱多": "西班牙",
            "昆卡": "西班牙", "卡塞雷斯": "西班牙", "塞哥维亚": "西班牙", "阿维拉": "西班牙", "布尔戈斯": "西班牙",
            "马略卡岛": "西班牙", "伊比萨": "西班牙", "特内里费": "西班牙", "大加那利": "西班牙", "兰萨罗特": "西班牙",
            
            # 葡萄牙
            "里斯本": "葡萄牙", "波尔图": "葡萄牙", "阿马多拉": "葡萄牙", "布拉加": "葡萄牙", "塞图巴尔": "葡萄牙",
            "科英布拉": "葡萄牙", "丰沙尔": "葡萄牙", "阿威罗": "葡萄牙", "埃武拉": "葡萄牙", "法鲁": "葡萄牙",
            "阿尔布费拉": "葡萄牙", "辛特拉": "葡萄牙", "卡斯凯什": "葡萄牙", "奥比杜什": "葡萄牙", "波尔塔莱格雷": "葡萄牙",
            "吉马良斯": "葡萄牙", "维亚纳堡": "葡萄牙", "维塞乌": "葡萄牙", "拉戈什": "葡萄牙", "萨格里什": "葡萄牙",
            
            # 希腊
            "雅典": "希腊", "塞萨洛尼基": "希腊", "帕特雷": "希腊", "伊拉克利翁": "希腊", "拉里萨": "希腊",
            "沃洛斯": "希腊", "约阿尼纳": "希腊", "卡瓦拉": "希腊", "哈尼亚": "希腊", "塞雷斯": "希腊",
            "圣托里尼": "希腊", "米科诺斯": "希腊", "罗德岛": "希腊", "科孚": "希腊", "克里特": "希腊",
            "帕罗斯": "希腊", "纳克索斯": "希腊", "扎金索斯": "希腊", "凯法利尼亚": "希腊", "斯基亚索斯": "希腊",
            "德尔菲": "希腊", "奥林匹亚": "希腊", "迈锡尼": "希腊", "埃皮达鲁斯": "希腊", "梅泰奥拉": "希腊",
            
            # === 中欧 ===
            # 奥地利
            "维也纳": "奥地利", "格拉茨": "奥地利", "林茨": "奥地利", "萨尔茨堡": "奥地利", "因斯布鲁克": "奥地利",
            "克拉根福": "奥地利", "菲拉赫": "奥地利", "韦尔斯": "奥地利", "圣珀尔滕": "奥地利", "多恩比恩": "奥地利",
            "维也纳新城": "奥地利", "施泰尔": "奥地利", "费尔德基兴": "奥地利", "布鲁克": "奥地利", "莱奥本": "奥地利",
            "哈尔施塔特": "奥地利", "巴德伊舍尔": "奥地利", "梅尔克": "奥地利", "瓦绍": "奥地利", "库夫斯坦": "奥地利",
            
            # 捷克
            "布拉格": "捷克", "布尔诺": "捷克", "俄斯特拉发": "捷克", "比尔森": "捷克", "奥洛穆茨": "捷克",
            "利贝雷茨": "捷克", "赫拉德茨克拉洛韦": "捷克", "乌斯季": "捷克", "帕尔杜比采": "捷克", "兹林": "捷克",
            "哈维若夫": "捷克", "克拉德诺": "捷克", "切斯凯布杰约维采": "捷克", "莫斯特": "捷克", "卡尔维纳": "捷克",
            "库特纳霍拉": "捷克", "泰尔奇": "捷克", "克鲁姆洛夫": "捷克", "卡尔什特因": "捷克", "布拉格城堡": "捷克",
            
            # 匈牙利
            "布达佩斯": "匈牙利", "德布勒森": "匈牙利", "塞格德": "匈牙利", "米什科尔茨": "匈牙利", "佩奇": "匈牙利",
            "焦尔": "匈牙利", "尼赖吉哈佐": "匈牙利", "凯奇凯梅特": "匈牙利", "塞克什白堡": "匈牙利", "松博特海伊": "匈牙利",
            "松博特海伊": "匈牙利", "维斯普雷姆": "匈牙利", "埃格尔": "匈牙利", "贝凯什乔包": "匈牙利", "大沃拉丁": "匈牙利",
            "埃斯泰尔戈姆": "匈牙利", "维谢格拉德": "匈牙利", "霍洛克": "匈牙利", "蒂豪尼": "匈牙利", "巴拉顿湖": "匈牙利",
            
            # 波兰
            "华沙": "波兰", "克拉科夫": "波兰", "罗兹": "波兰", "弗罗茨瓦夫": "波兰", "波兹南": "波兰",
            "格但斯克": "波兰", "什切青": "波兰", "比得哥什": "波兰", "卢布林": "波兰", "卡托维兹": "波兰",
            "白雅斯托克": "波兰", "格丁尼亚": "波兰", "琴斯托霍瓦": "波兰", "拉多姆": "波兰", "索斯诺维茨": "波兰",
            "托伦": "波兰", "基尔采": "波兰", "格利维采": "波兰", "扎布热": "波兰", "比托姆": "波兰",
            "奥斯威辛": "波兰", "马尔堡": "波兰", "扎科帕内": "波兰", "维利奇卡": "波兰", "弗罗茨瓦夫": "波兰",
            
            # 斯洛伐克
            "布拉迪斯拉发": "斯洛伐克", "科希策": "斯洛伐克", "普雷绍夫": "斯洛伐克", "日利纳": "斯洛伐克", "班斯卡比斯特里察": "斯洛伐克",
            "尼特拉": "斯洛伐克", "特伦钦": "斯洛伐克", "马丁": "斯洛伐克", "特尔纳瓦": "斯洛伐克", "波普拉德": "斯洛伐克",
            "普里维德扎": "斯洛伐克", "兹沃伦": "斯洛伐克", "巴尔代约夫": "斯洛伐克", "列沃恰": "斯洛伐克", "斯皮什斯基堡": "斯洛伐克",
            
            # 斯洛文尼亚
            "卢布尔雅那": "斯洛文尼亚", "马里博尔": "斯洛文尼亚", "采列": "斯洛文尼亚", "克拉尼": "斯洛文尼亚", "韦莱涅": "斯洛文尼亚",
            "新戈里察": "斯洛文尼亚", "科佩尔": "斯洛文尼亚", "诺沃梅斯托": "斯洛文尼亚", "卡姆尼克": "斯洛文尼亚", "多姆扎勒": "斯洛文尼亚",
            "布莱德": "斯洛文尼亚", "博希尼": "斯洛文尼亚", "皮兰": "斯洛文尼亚", "什科茨扬": "斯洛文尼亚", "波斯托伊纳": "斯洛文尼亚",
            
            # 瑞士
            "苏黎世": "瑞士", "日内瓦": "瑞士", "巴塞尔": "瑞士", "伯尔尼": "瑞士", "洛桑": "瑞士",
            "圣加仑": "瑞士", "卢塞恩": "瑞士", "卢加诺": "瑞士", "比尔": "瑞士", "图恩": "瑞士",
            "拉绍德封": "瑞士", "沙夫豪森": "瑞士", "弗里堡": "瑞士", "韦维": "瑞士", "拉佩斯": "瑞士",
            "因特拉肯": "瑞士", "采尔马特": "瑞士", "格林德瓦": "瑞士", "少女峰": "瑞士", "马特洪峰": "瑞士",
            "圣莫里茨": "瑞士", "洛伊克巴德": "瑞士", "安德马特": "瑞士", "文根": "瑞士", "拉克斯": "瑞士",
            
            # === 北欧 ===
            # 瑞典
            "斯德哥尔摩": "瑞典", "哥德堡": "瑞典", "马尔默": "瑞典", "乌普萨拉": "瑞典", "林雪平": "瑞典",
            "韦斯特罗斯": "瑞典", "厄勒布鲁": "瑞典", "北雪平": "瑞典", "赫尔辛堡": "瑞典", "永雪平": "瑞典",
            "松兹瓦尔": "瑞典", "于默奥": "瑞典", "韦克舍": "瑞典", "加夫勒": "瑞典", "博罗斯": "瑞典",
            "法伦": "瑞典", "卡尔斯塔德": "瑞典", "卡尔马": "瑞典", "维斯比": "瑞典", "基律纳": "瑞典",
            
            # 挪威
            "奥斯陆": "挪威", "卑尔根": "挪威", "特隆赫姆": "挪威", "斯塔万格": "斯洛文尼亚", "克里斯蒂安桑": "挪威",
            "腓特烈斯塔": "挪威", "德拉门": "挪威", "谢恩": "挪威", "桑内斯": "挪威", "萨尔普斯堡": "挪威",
            "特洛姆瑟": "挪威", "博多": "挪威", "阿尔塔": "挪威", "哈默菲斯特": "挪威", "纳尔维克": "挪威",
            "弗洛姆": "挪威", "盖朗厄尔": "挪威", "奥勒松": "挪威", "利勒哈默尔": "挪威", "罗弗敦群岛": "挪威",
            
            # 丹麦
            "哥本哈根": "丹麦", "奥胡斯": "丹麦", "欧登塞": "丹麦", "奥尔堡": "丹麦", "埃斯比约": "丹麦",
            "兰德斯": "丹麦", "科尔丁": "丹麦", "赫尔辛格": "丹麦", "马里布": "丹麦", "海勒鲁普": "丹麦",
            "比隆": "丹麦", "希勒勒": "丹麦", "罗斯基勒": "丹麦", "斯卡恩": "丹麦", "法尔瑟特": "丹麦",
            
            # 芬兰
            "赫尔辛基": "芬兰", "埃斯波": "芬兰", "坦佩雷": "芬兰", "万塔": "芬兰", "图尔库": "芬兰",
            "奥卢": "芬兰", "拉赫蒂": "芬兰", "库奥皮奥": "芬兰", "约恩苏": "芬兰", "约瓦斯屈莱": "芬兰",
            "拉彭兰塔": "芬兰", "科特卡": "芬兰", "瓦萨": "芬兰", "弗绍": "芬兰", "海门林纳": "芬兰",
            "罗瓦涅米": "芬兰", "凯米": "芬兰", "托尔尼奥": "芬兰", "萨利色尔卡": "芬兰", "伊瓦洛": "芬兰",
            
            # 冰岛
            "雷克雅未克": "冰岛", "科帕沃古尔": "冰岛", "哈夫纳夫约杜尔": "冰岛", "阿克雷里": "冰岛", "雷克雅内斯": "冰岛",
            "塞尔福斯": "冰岛", "韦斯特曼纳群岛": "冰岛", "胡萨维克": "冰岛", "埃伊尔斯塔济": "冰岛", "凯夫拉维克": "冰岛",
            
            # === 东欧 ===
            # 俄罗斯(欧洲部分)
            "莫斯科": "俄罗斯", "圣彼得堡": "俄罗斯", "下诺夫哥罗德": "俄罗斯", "喀山": "俄罗斯", "萨马拉": "俄罗斯",
            "伏尔加格勒": "俄罗斯", "罗斯托夫": "俄罗斯", "乌法": "俄罗斯", "彭萨": "俄罗斯", "雅罗斯拉夫": "俄罗斯",
            "卡卢加": "俄罗斯", "图拉": "俄罗斯", "弗拉基米尔": "俄罗斯", "苏兹达尔": "俄罗斯", "谢尔盖夫": "俄罗斯",
            
            # 乌克兰
            "基辅": "乌克兰", "哈尔科夫": "乌克兰", "敖德萨": "乌克兰", "第聂伯": "乌克兰", "顿涅茨克": "乌克兰",
            "扎波罗热": "乌克兰", "利沃夫": "乌克兰", "克里沃罗格": "乌克兰", "尼古拉耶夫": "乌克兰", "马里乌波尔": "乌克兰",
            "卢甘斯克": "乌克兰", "文尼察": "乌克兰", "赫尔松": "乌克兰", "切尔卡瑟": "乌克兰", "切尔尼戈夫": "乌克兰",
            
            # 白俄罗斯
            "明斯克": "白俄罗斯", "戈梅利": "白俄罗斯", "莫吉廖夫": "白俄罗斯", "维帖布斯克": "白俄罗斯", "格罗德诺": "白俄罗斯",
            "布列斯特": "白俄罗斯", "鲍里索夫": "白俄罗斯", "巴拉诺维奇": "白俄罗斯", "平斯克": "白俄罗斯", "奥尔沙": "白俄罗斯",
            
            # 波罗的海三国
            "里加": "拉脱维亚", "陶格夫匹尔斯": "拉脱维亚", "利耶帕亚": "拉脱维亚", "叶尔加瓦": "拉脱维亚", "文茨皮尔斯": "拉脱维亚",
            "塔林": "爱沙尼亚", "塔尔图": "爱沙尼亚", "纳尔瓦": "爱沙尼亚", "帕尔努": "爱沙尼亚", "科赫特拉": "爱沙尼亚",
            "维尔纽斯": "立陶宛", "考纳斯": "立陶宛", "克莱佩达": "立陶宛", "希奥利艾": "立陶宛", "帕内韦日斯": "立陶宛",
            
            # 摩尔多瓦
            "基希讷乌": "摩尔多瓦", "蒂拉斯波尔": "摩尔多瓦", "巴尔济": "摩尔多瓦", "本德尔": "摩尔多瓦", "雷布尼察": "摩尔多瓦",
            
            # === 巴尔干半岛 ===
            # 克罗地亚
            "萨格勒布": "克罗地亚", "斯普利特": "克罗地亚", "里耶卡": "克罗地亚", "奥西耶克": "克罗地亚", "扎达尔": "克罗地亚",
            "普拉": "克罗地亚", "杜布罗夫尼克": "克罗地亚", "希贝尼克": "克罗地亚", "卡尔洛瓦茨": "克罗地亚", "瓦拉日丁": "克罗地亚",
            "罗维尼": "克罗地亚", "波雷奇": "克罗地亚", "特罗吉尔": "克罗地亚", "赫瓦尔": "克罗地亚", "科尔丘拉": "克罗地亚",
            
            # 塞尔维亚
            "贝尔格莱德": "塞尔维亚", "诺维萨德": "塞尔维亚", "尼什": "塞尔维亚", "克拉古耶瓦茨": "塞尔维亚", "苏博蒂察": "塞尔维亚",
            "潘切沃": "塞尔维亚", "泽蒙": "塞尔维亚", "莱斯科瓦茨": "塞尔维亚", "恰恰克": "塞尔维亚", "新帕扎尔": "塞尔维亚",
            
            # 波黑
            "萨拉热窝": "波黑", "巴尼亚卢卡": "波黑", "图兹拉": "波黑", "泽尼察": "波黑", "莫斯塔尔": "波黑",
            "比哈奇": "波黑", "布里耶利纳": "波黑", "多博伊": "波黑", "格拉迪什卡": "波黑", "利夫诺": "波黑",
            
            # 黑山
            "波德戈里察": "黑山", "尼克希奇": "黑山", "普里耶波列": "黑山", "比耶洛波列": "黑山", "采蒂涅": "黑山",
            "布德瓦": "黑山", "科托尔": "黑山", "乌尔齐尼": "黑山", "赫尔采格诺维": "黑山", "巴尔": "黑山",
            
            # 北马其顿
            "斯科普里": "北马其顿", "库马诺沃": "北马其顿", "比托拉": "北马其顿", "普里莱普": "北马其顿", "特托沃": "北马其顿",
            "韦莱斯": "北马其顿", "什蒂普": "北马其顿", "奥赫里德": "北马其顿", "戈斯蒂瓦尔": "北马其顿", "斯特鲁加": "北马其顿",
            
            # 阿尔巴尼亚
            "地拉那": "阿尔巴尼亚", "都拉斯": "阿尔巴尼亚", "埃尔巴桑": "阿尔巴尼亚", "发罗拉": "阿尔巴尼亚", "斯库台": "阿尔巴尼亚",
            "科尔察": "阿尔巴尼亚", "卢什涅": "阿尔巴尼亚", "费里": "阿尔巴尼亚", "贝拉特": "阿尔巴尼亚", "吉诺卡斯特": "阿尔巴尼亚",
            
            # 保加利亚
            "索菲亚": "保加利亚", "普罗夫迪夫": "保加利亚", "瓦尔纳": "保加利亚", "布尔加斯": "保加利亚", "鲁塞": "保加利亚",
            "斯塔拉扎戈拉": "保加利亚", "普列文": "保加利亚", "슬리문": "保加利亚", "多布里奇": "保加利亚", "舒门": "保加利亚",
            "帕扎尔吉克": "保加利亚", "哈斯科沃": "保加利亚", "扬博尔": "保加利亚", "布拉戈耶夫格勒": "保加利亚", "韦利科特尔诺沃": "保加利亚",
            
            # 罗马尼亚
            "布加勒斯特": "罗马尼亚", "克卢日": "罗马尼亚", "蒂米什瓦拉": "罗马尼亚", "雅西": "罗马尼亚", "康斯坦察": "罗马尼亚",
            "克拉约瓦": "罗马尼亚", "布拉索夫": "罗马尼亚", "加拉茨": "罗马尼亚", "普洛耶什蒂": "罗马尼亚", "奥拉迪亚": "罗马尼亚",
            "布勒伊拉": "罗马尼亚", "阿拉德": "罗马尼亚", "皮特什蒂": "罗马尼亚", "锡比乌": "罗马尼亚", "巴克乌": "罗马尼亚",
            "锡纳亚": "罗马尼亚", "布兰": "罗马尼亚", "德古拉城堡": "罗马尼亚", "佩莱什城堡": "罗马尼亚", "马拉穆雷什": "罗马尼亚",
            
            # 土耳其(欧洲部分)
            "伊斯坦布尔": "土耳其", "埃迪尔内": "土耳其", "泰基尔达": "土耳其", "克尔克拉雷利": "土耳其", "恰纳卡莱": "土耳其",
            
            # 塞浦路斯
            "尼科西亚": "塞浦路斯", "利马索尔": "塞浦路斯", "拉纳卡": "塞浦路斯", "法马古斯塔": "塞浦路斯", "帕福斯": "塞浦路斯",
            "凯里尼亚": "塞浦路斯", "阿依纳帕": "塞浦路斯", "普罗塔拉斯": "塞浦路斯", "特罗多斯": "塞浦路斯", "阿卡马斯": "塞浦路斯",
            
            # 马耳他
            "瓦莱塔": "马耳他", "斯利马": "马耳他", "圣朱利安斯": "马耳他", "姆西达": "马耳他", "维多利亚": "马耳他",
            "马尔萨什洛克": "马耳他", "梅利哈": "马耳他", "戈佐": "马耳他", "蓝湖": "马耳他", "姆迪纳": "马耳他",
        }
        
        # 欧洲城市别名映射(包含各种表达方式)
        self.european_city_aliases = {
            # 英文名称映射
            "paris": "巴黎", "rome": "罗马", "london": "伦敦", "berlin": "柏林", 
            "madrid": "马德里", "barcelona": "巴塞罗那", "vienna": "维也纳", "prague": "布拉格",
            "amsterdam": "阿姆斯特丹", "florence": "佛罗伦萨", "venice": "威尼斯", "athens": "雅典",
            "budapest": "布达佩斯", "lisbon": "里斯本", "stockholm": "斯德哥尔摩", "copenhagen": "哥本哈根",
            "helsinki": "赫尔辛基", "oslo": "奥斯陆", "zurich": "苏黎世", "geneva": "日内瓦",
            "munich": "慕尼黑", "milan": "米兰", "naples": "那不勒斯", "nice": "尼斯",
            "edinburgh": "爱丁堡", "dublin": "都柏林", "brussels": "布鲁塞尔", "warsaw": "华沙",
            "krakow": "克拉科夫", "zagreb": "萨格勒布", "belgrade": "贝尔格莱德", "sofia": "索菲亚",
            "bucharest": "布加勒斯特", "kiev": "基辅", "moscow": "莫斯科", "st petersburg": "圣彼得堡",
            "reykjavik": "雷克雅未克", "tallinn": "塔林", "riga": "里加", "vilnius": "维尔纽斯",
            "bratislava": "布拉迪斯拉发", "ljubljana": "卢布尔雅那", "sarajevo": "萨拉热窝",
            "dubrovnik": "杜布罗夫尼克", "split": "斯普利特", "santorini": "圣托里尼", "mykonos": "米科诺斯",
            
            # 中文别名
            "花都": "巴黎", "光之城": "巴黎", "永恒之城": "罗马", "雾都": "伦敦",
            "音乐之都": "维也纳", "黄金城市": "布拉格", "千塔之城": "布拉格",
            "运河之城": "阿姆斯特丹", "翡冷翠": "佛罗伦萨", "文艺复兴之都": "佛罗伦萨",
            "水城": "威尼斯", "西方文明的摇篮": "雅典", "多瑙河明珠": "布达佩斯",
            "七丘之城": "里斯本", "北方威尼斯": "斯德哥尔摩", "童话之都": "哥本哈根",
            "波罗的海的女儿": "赫尔辛基", "欧洲屋脊": "因特拉肯", "北方雅典": "爱丁堡",
            "翡翠岛": "都柏林", "欧洲之都": "布鲁塞尔", "高迪之城": "巴塞罗那",
        }
        
        self.chinese_numbers = {
            '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
            '两': 2, '半': 0.5, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
            # 英文数字
            'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
            'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15,
            # 特殊时长表达
            '半个月': 15, '一个月': 30, '半年': 180, '一年': 365,
            '半天': 0.5, '一天': 1, '两天': 2, '三天': 3, '四天': 4, '五天': 5, '六天': 6, '七天': 7,
            '八天': 8, '九天': 9, '十天': 10, '半周': 3.5, '一周': 7, '两周': 14,
            # 假期相关
            '小长假': 3, '长假': 7, '十一': 7, '国庆': 7, '春节': 7, '五一': 3, '清明': 3,
            '端午': 3, '中秋': 3, '元旦': 3, '暑假': 60, '寒假': 30, '周末': 2, '长周末': 3,
            # 英文假期
            'weekend': 2, 'week': 7, 'month': 30, 'vacation': 7, 'holiday': 3
        }

    def extract(self, user_message: str, existing_info: dict = None) -> dict:
        """Extract destination, duration and budget details from a message.

        The message is tokenized and each category is pulled out by its
        ``_extract_*_from_tokens`` helper. Whatever is found is merged on top
        of a deep copy of ``existing_info``, so information collected earlier
        is kept and the caller's dictionary is never modified.

        Args:
            user_message: Raw text typed by the user.
            existing_info: Information gathered so far, if any.

        Returns:
            ``existing_info`` updated with the newly extracted categories.
            If the message is empty, not a string, or shorter than two
            characters after trimming, ``existing_info`` (or an empty dict)
            is returned unchanged.
        """
        # Input validation: nothing to extract from, so hand back the context.
        if not user_message or not isinstance(user_message, str):
            log.warning("⚠️ 收到无效的用户消息")
            return existing_info or {}

        if len(user_message.strip()) < 2:
            log.warning("⚠️ 用户消息过短,跳过信息提取")
            return existing_info or {}

        if existing_info:
            log.info(f"接收到上下文信息,将在此基础上更新: {existing_info}")
            # Work on a copy so the merge below cannot alter the caller's data.
            result = copy.deepcopy(existing_info)
        else:
            result = {}

        log.info(f"🛠️ 使用分词策略提取信息:'{user_message[:50]}...'")

        # 1. Tokenize the message.
        tokens = self._tokenize_message(user_message)
        log.info(f"📝 分词结果:{tokens}")

        # 2. Run one extractor per category; keep only categories that
        #    produced something.
        extractors = (
            ("destination", self._extract_destination_from_tokens),
            ("duration", self._extract_duration_from_tokens),
            ("budget", self._extract_budget_from_tokens),
        )
        newly_extracted_info = {}
        for category, extractor in extractors:
            found = extractor(tokens)
            if found:
                newly_extracted_info[category] = found

        log.info(f"📊 分词提取结果: {newly_extracted_info}")

        # 3. Layer the new findings over the existing context. The early
        #    exits above already return the context, so the normal path must
        #    do the same instead of dropping it.
        return self._merge_info(newly_extracted_info, result)
    
    def _merge_info(self, new_info: dict, existing_info: dict) -> dict:

        for key, value in new_info.items():
            # 如果新旧信息中同一个键的值都是字典,则递归深入合并
            if isinstance(value, dict) and key in existing_info and isinstance(existing_info[key], dict):
                self._merge_info(value, existing_info[key])
            else:
                # 否则,直接用新信息覆盖或添加
                existing_info[key] = value
        return existing_info


    def _tokenize_message(self, text: str) -> list:
        """智能分词,支持中英文混合"""
        
        # 预处理:统一标点符号和空格
        text = text.replace(',', ',').replace('。', '.').replace('!', '!').replace('?', '?')
        text = text.replace('(', '(').replace(')', ')').replace('【', '[').replace('】', ']')
        
        tokens = []
        current_token = ""
        i = 0
        
        while i < len(text):
            char = text[i]
            
            # 处理空格和标点符号
            if char in ' ,,.。!!??()()[]【】::;;':
                if current_token:
                    tokens.append(current_token)
                    current_token = ""
                if char.strip():  # 保留非空格的标点符号
                    tokens.append(char)
                i += 1
                continue
            
            # 处理数字(包括小数和货币符号)
            if char.isdigit() or char in '¥$€£₩':
                if current_token and not (current_token[-1].isdigit() or current_token[-1] in '¥$€£₩.'):
                    tokens.append(current_token)
                    current_token = char
                else:
                    current_token += char
                
                # 继续读取数字部分
                i += 1
                while i < len(text) and (text[i].isdigit() or text[i] in '.,'):
                    current_token += text[i]
                    i += 1
                
                # 检查货币单位
                currency_units = ['元', '块', '钱', '欧', '美元', '英镑', '日元', '韩元', '瑞郎', 'rmb', 'usd', 'eur', 'gbp', 'jpy', 'krw', 'chf']
                remaining_text = text[i:].lower()
                for unit in currency_units:
                    if remaining_text.startswith(unit):
                        current_token += text[i:i+len(unit)]
                        i += len(unit)
                        break
                
                tokens.append(current_token)
                current_token = ""
                continue
            
            # 处理英文单词
            if char.isalpha() and ord(char) < 128:  # ASCII字符
                if current_token and not current_token[-1].isalpha():
                    tokens.append(current_token)
                    current_token = char
                else:
                    current_token += char
                
                # 继续读取英文字符
                i += 1
                while i < len(text) and text[i].isalpha() and ord(text[i]) < 128:
                    current_token += text[i]
                    i += 1
                
                tokens.append(current_token)
                current_token = ""
                continue
            
            # 处理中文字符
            if self._is_chinese_char(char):
                if current_token and not self._is_chinese_char(current_token[-1]):
                    tokens.append(current_token)
                    current_token = ""
                
                # 对于中文,我们需要智能分词
                # 检查是否是多字符城市名、时间表达等
                remaining_text = text[i:]
                
                # 尝试匹配城市名
                matched_city = self._match_city_name(remaining_text)
                if matched_city:
                    tokens.append(matched_city)
                    i += len(matched_city)
                    continue
                
                # 尝试匹配时间表达
                matched_time = self._match_time_expression(remaining_text)
                if matched_time:
                    tokens.append(matched_time)
                    i += len(matched_time)
                    continue
                
                # 尝试匹配预算类型关键词
                matched_budget_type = self._match_budget_type(remaining_text)
                if matched_budget_type:
                    tokens.append(matched_budget_type)
                    i += len(matched_budget_type)
                    continue
                
                # 尝试匹配常见词汇
                matched_word = self._match_common_word(remaining_text)
                if matched_word:
                    tokens.append(matched_word)
                    i += len(matched_word)
                    continue
                
                # 单个中文字符
                tokens.append(char)
                i += 1
            else:
                # 其他字符
                current_token += char
                i += 1
        
        # 处理最后的token
        if current_token:
            tokens.append(current_token)
        
        # 后处理:合并一些相关的tokens
        tokens = self._post_process_tokens(tokens)
        
        return [token for token in tokens if token.strip()]  # 过滤空token

    def _is_chinese_char(self, char: str) -> bool:
        """判断是否为中文字符"""
        return '\u4e00' <= char <= '\u9fff'

    def _match_city_name(self, text: str) -> str:
        """匹配城市名称"""
        # 按长度从长到短排序,优先匹配长的城市名
        all_cities = list(self.european_cities.keys()) + list(self.european_city_aliases.keys())
        all_cities = sorted(set(all_cities), key=len, reverse=True)
        
        for city in all_cities:
            if text.startswith(city):
                return city
        return ""

    def _match_time_expression(self, text: str) -> str:
        """匹配时间表达"""
        time_expressions = [
            # 多字符时间表达
            '半个月', '一个月', '两个月', '三个月', '半年', '一年',
            '小长假', '长周末', '国庆节', '春节假期', '暑假', '寒假',
            '一天半', '两天半', '三天半', '一周半', '两周',
            # 英文时间表达
            'one day', 'two days', 'three days', 'one week', 'two weeks',
            'long weekend', 'vacation', 'holiday', 'spring break'
        ]
        
        # 按长度排序,优先匹配长表达
        time_expressions = sorted(time_expressions, key=len, reverse=True)
        
        text_lower = text.lower()
        for expr in time_expressions:
            if text_lower.startswith(expr.lower()):
                return expr
            if text.startswith(expr):
                return expr
        return ""

    def _match_budget_type(self, text: str) -> str:
        """匹配预算类型关键词"""
        budget_keywords = [
            # 经济型
            '经济实惠', '省钱', '便宜', '实惠', '经济', '穷游', '背包客',
            '青年旅社', '学生', '预算有限', '性价比',
            # 舒适型
            '舒适', '中等', '适中', '标准', '普通', '中档', '合理',
            # 豪华型
            '豪华', '奢华', '高端', '顶级', '精品', '五星', '不差钱',
            '任性', '土豪', 'VIP', '贵族', '皇家'
        ]
        
        # 按长度排序
        budget_keywords = sorted(budget_keywords, key=len, reverse=True)
        
        for keyword in budget_keywords:
            if text.startswith(keyword):
                return keyword
        return ""

    def _match_common_word(self, text: str) -> str:
        """匹配常见词汇"""
        common_words = [
            # 旅行相关动词
            '想去', '计划去', '打算去', '准备去', '希望去', '考虑去',
            '前往', '旅行', '旅游', '游玩', '度假', '出发', '飞往',
            # 时间相关
            '三天', '四天', '五天', '六天', '七天', '八天', '九天', '十天',
            '一天', '两天', '几天', '多天', '数天',
            # 预算相关
            '预算', '花费', '费用', '成本', '开销', '支出', '消费',
            '总共', '一共', '大概', '约', '左右', '差不多',
            # 其他
            '行程', '计划', '安排', '路线', '攻略'
        ]
        
        # 按长度排序
        common_words = sorted(common_words, key=len, reverse=True)
        
        for word in common_words:
            if text.startswith(word):
                return word
        return ""

    def _post_process_tokens(self, tokens: list) -> list:
        """后处理tokens,合并相关的片段"""
        if not tokens:
            return tokens
        
        processed = []
        i = 0
        
        while i < len(tokens):
            current_token = tokens[i]
            
            # 合并数字+单位的组合
            if i < len(tokens) - 1:
                next_token = tokens[i + 1]
                
                # 数字 + 货币单位
                if (current_token.isdigit() and 
                    next_token.lower() in ['元', '块', '钱', '欧', '美元', '英镑', '日元', 'rmb', 'usd', 'eur', 'gbp', 'jpy']):
                    processed.append(current_token + next_token)
                    i += 2
                    continue
                
                # 数字 + 时间单位
                if (current_token.isdigit() and 
                    next_token in ['天', '日', '周', '月', '年', 'days', 'weeks', 'months']):
                    processed.append(current_token + next_token)
                    i += 2
                    continue
                
                # 预算 + 数字
                if current_token == '预算' and next_token.replace('.', '').replace(',', '').isdigit():
                    if i < len(tokens) - 2 and tokens[i + 2] in ['元', '块', '钱', '欧', 'rmb', 'usd', 'eur']:
                        processed.append(current_token + next_token + tokens[i + 2])
                        i += 3
                        continue
                    else:
                        processed.append(current_token + next_token)
                        i += 2
                        continue
            
            processed.append(current_token)
            i += 1
        
        return processed

    def _extract_destination_from_tokens(self, tokens: list) -> dict:
        """从tokens中提取目的地信息"""
        result = {}
        
        # 查找城市名
        for i, token in enumerate(tokens):
            # 直接匹配城市名
            city_name = self._normalize_city_name(token)
            if city_name:
                result["name"] = city_name
                if city_name in self.european_cities:
                    result["country"] = self.european_cities[city_name]
                break
            
            # 检查是否在动词后面
            if i > 0:
                prev_token = tokens[i - 1]
                if prev_token in ['去', '到', '想去', '前往', '旅行', '游', '玩', 'go', 'to', 'visit', 'travel']:
                    city_name = self._normalize_city_name(token)
                    if city_name:
                        result["name"] = city_name
                        if city_name in self.european_cities:
                            result["country"] = self.european_cities[city_name]
                        break
        
        # 如果没有找到,尝试fuzzy匹配
        if not result:
            for token in tokens:
                if len(token) >= 2:
                    # 模糊匹配城市名
                    for city, country in self.european_cities.items():
                        if token in city or city in token:
                            if len(token) >= len(city) * 0.6:  # 相似度阈值
                                result["name"] = city
                                result["country"] = country
                                break
                    if result:
                        break
        
        return result

    def _normalize_city_name(self, token: str) -> str:
        """标准化城市名称"""
        if not token:
            return ""
        
        token_lower = token.lower().strip()
        
        # 直接匹配
        if token in self.european_cities:
            return token
        
        # 别名匹配
        if token_lower in self.european_city_aliases:
            return self.european_city_aliases[token_lower]
        
        if token in self.european_city_aliases:
            return self.european_city_aliases[token]
        
        return ""

    def _extract_duration_from_tokens(self, tokens: list) -> dict:
        """从tokens中提取时长信息"""
        result = {}
        
        for i, token in enumerate(tokens):
            days = None
            description = ""
            
            # 处理 "数字+天" 的token
            if re.match(r'^\d+[天日]$', token):
                days = int(re.findall(r'\d+', token)[0])
            
            # 处理 "数字+weeks/days" 的token
            elif re.match(r'^\d+(days?|weeks?|months?)$', token.lower()):
                number = int(re.findall(r'\d+', token)[0])
                unit = re.findall(r'[a-zA-Z]+', token.lower())[0]
                if unit.startswith('day'):
                    days = number
                elif unit.startswith('week'):
                    days = number * 7
                elif unit.startswith('month'):
                    days = number * 30
            
            # 处理分离的数字和单位
            elif token.isdigit() and i < len(tokens) - 1:
                next_token = tokens[i + 1]
                number = int(token)
                
                if next_token in ['天', '日']:
                    days = number
                elif next_token in ['周', '星期', '礼拜', 'week', 'weeks']:
                    days = number * 7
                elif next_token in ['月', '个月', 'month', 'months']:
                    days = number * 30
            
            # 处理中文数字
            elif token in self.chinese_numbers:
                days = self.chinese_numbers[token]
                description = token
            
            # 处理特殊时长表达
            elif token in ['周末', 'weekend']:
                days = 2
                description = token
            elif token in ['长周末', 'long weekend']:
                days = 3
                description = token
            elif token in ['小长假', 'vacation', 'holiday']:
                days = 3
                description = token
            elif token in ['十一', '国庆', 'national day']:
                days = 7
                description = token
            elif token in ['春节', 'spring festival']:
                days = 7
                description = token
            elif token in ['暑假', 'summer vacation']:
                days = 60
                description = token
            elif token in ['寒假', 'winter vacation']:
                days = 30
                description = token
            
            # 处理复合表达 "三天两夜"
            elif re.match(r'^[一二三四五六七八九十\d]+天', token):
                # 提取数字部分
                for num_token in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']:
                    if token.startswith(num_token):
                        days = self.chinese_numbers[num_token]
                        description = token
                        break
                if not days and token[0].isdigit():
                    days = int(token[0])
                    description = token
            
            # 验证天数合理性并设置结果
            if days and 0.5 <= days <= 365:
                result["days"] = int(days) if days >= 1 else days
                
                if not description:
                    # 添加描述信息
                    if days <= 1:
                        description = "当日往返"
                    elif days <= 3:
                        description = "短途旅行"
                    elif days <= 7:
                        description = "一周内旅行"
                    elif days <= 14:
                        description = "中长途旅行"
                    elif days <= 30:
                        description = "长途旅行"
                    else:
                        description = "超长途旅行"
                
                result["description"] = description
                break
        
        return result

    def _extract_budget_from_tokens(self, tokens: list) -> dict:
        """从tokens中提取预算信息"""
        result = {}
        
        # 1. 查找金额
        for i, token in enumerate(tokens):
            amount = None
            currency = "RMB"  # 默认货币
            
            # 处理包含货币的token "2000欧", "5000元"
            currency_patterns = [
                (r'(\d+(?:\.\d+)?)欧(?:元)?', 'EUR'),
                (r'(\d+(?:\.\d+)?)元', 'RMB'),
                (r'(\d+(?:\.\d+)?)块(?:钱)?', 'RMB'),
                (r'(\d+(?:\.\d+)?)人民币', 'RMB'),
                (r'(\d+(?:\.\d+)?)美元', 'USD'),
                (r'(\d+(?:\.\d+)?)英镑', 'GBP'),
                (r'(\d+(?:\.\d+)?)瑞(?:士)?法郎', 'CHF'),
                (r'(\d+(?:\.\d+)?)日元', 'JPY'),
                (r'(\d+(?:\.\d+)?)韩元', 'KRW'),
                (r'¥(\d+(?:\.\d+)?)', 'RMB'),
                (r'€(\d+(?:\.\d+)?)', 'EUR'),
                (r'\$(\d+(?:\.\d+)?)', 'USD'),
                (r'£(\d+(?:\.\d+)?)', 'GBP'),
                (r'(\d+(?:\.\d+)?)rmb', 'RMB'),
                (r'(\d+(?:\.\d+)?)usd', 'USD'),
                (r'(\d+(?:\.\d+)?)eur', 'EUR'),
                (r'(\d+(?:\.\d+)?)gbp', 'GBP'),
                (r'(\d+(?:\.\d+)?)chf', 'CHF'),
            ]
            
            for pattern, curr in currency_patterns:
                match = re.search(pattern, token.lower())
                if match:
                    amount = float(match.group(1))
                    currency = curr
                    break
            
            # 处理纯数字token(需要查看上下文)
            if not amount and re.match(r'^\d+(?:\.\d+)?$', token):
                number = float(token)
                
                # 检查前面的token是否有预算相关词汇
                budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', '总共', '一共', 'budget', 'cost', 'spend']
                has_budget_context = False
                
                if i > 0 and tokens[i-1] in budget_indicators:
                    has_budget_context = True
                elif i > 1 and tokens[i-2] in budget_indicators:
                    has_budget_context = True
                
                # 检查后面是否有货币单位
                if i < len(tokens) - 1:
                    next_token = tokens[i + 1].lower()
                    currency_units = {
                        '元': 'RMB', '块': 'RMB', '钱': 'RMB', '人民币': 'RMB',
                        '欧': 'EUR', '欧元': 'EUR', '美元': 'USD', '英镑': 'GBP',
                        '瑞郎': 'CHF', '日元': 'JPY', '韩元': 'KRW',
                        'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF'
                    }
                    
                    if next_token in currency_units:
                        amount = number
                        currency = currency_units[next_token]
                        has_budget_context = True
                
                # 如果有预算上下文但没有明确货币单位,根据数字大小推断
                if has_budget_context and not amount:
                    if number < 100:  # 可能是欧元或美元
                        # 查看是否有欧洲城市上下文
                        has_european_context = any(self._normalize_city_name(t) for t in tokens)
                        if has_european_context:
                            currency = 'EUR'
                        else:
                            currency = 'USD'
                    else:
                        currency = 'RMB'  # 大数字更可能是人民币
                    amount = number
            
            # 处理万、千等单位
            if amount:
                # 检查是否有万、千修饰符
                if i > 0:
                    prev_token = tokens[i-1]
                    if '万' in prev_token or 'w' in prev_token.lower():
                        amount *= 10000
                    elif '千' in prev_token or 'k' in prev_token.lower():
                        amount *= 1000
                elif i < len(tokens) - 1:
                    next_token = tokens[i+1]
                    if '万' in next_token or 'w' in next_token.lower():
                        amount *= 10000
                    elif '千' in next_token or 'k' in next_token.lower():
                        amount *= 1000
                
                if amount > 0:
                    result["amount"] = int(amount)
                    result["currency"] = currency
                    break
        
        # 2. 查找预算类型
        budget_type_keywords = {
            'economy': [
                '经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '青年',
                '预算有限', '钱不多', '不贵', '划算', '性价比', '背包客',
                '简单', '基础', '低成本', '节约', 'budget', 'cheap', 'economy', 'affordable'
            ],
            'comfortable': [
                '舒适', '中等', '适中', '一般', '标准', '普通', '正常', '常规',
                '中档', '中级', '合理', '平均', '中间档次', 'comfortable', 'standard', 'moderate'
            ],
            'luxury': [
                '豪华', '奢华', '高端', '顶级', '精品', '奢侈', '贵族', '皇家',
                '贵一点', '不差钱', '任性', '土豪', '有钱', '五星', 'VIP',
                'luxury', 'premium', 'high-end', 'expensive', 'fancy'
            ]
        }
        
        for token in tokens:
            token_lower = token.lower()
            for budget_type, keywords in budget_type_keywords.items():
                if any(keyword in token_lower for keyword in keywords):
                    result["type"] = budget_type
                    
                    # 找到第一个匹配的关键词作为描述
                    for keyword in keywords:
                        if keyword in token_lower:
                            result["description"] = keyword if len(keyword) > 2 else token
                            break
                    break
            if result.get("type"):
                break
        
        # 3. 如果有金额但没有类型,根据金额推断类型
        if result.get("amount") and not result.get("type"):
            amount = result["amount"]
            currency = result.get("currency", "RMB")
            
            # 根据欧洲旅行成本设置阈值
            if currency == "EUR":
                if amount < 1500:  # 总预算
                    result["type"] = "economy"
                    result["description"] = "经济预算"
                elif amount < 4000:
                    result["type"] = "comfortable"
                    result["description"] = "舒适预算"
                else:
                    result["type"] = "luxury"
                    result["description"] = "豪华预算"
            elif currency == "USD":
                if amount < 2000:
                    result["type"] = "economy"
                    result["description"] = "经济预算"
                elif amount < 5000:
                    result["type"] = "comfortable"
                    result["description"] = "舒适预算"
                else:
                    result["type"] = "luxury"
                    result["description"] = "豪华预算"
            elif currency == "RMB":
                if amount < 8000:
                    result["type"] = "economy"
                    result["description"] = "经济预算"
                elif amount < 20000:
                    result["type"] = "comfortable"
                    result["description"] = "舒适预算"
                else:
                    result["type"] = "luxury"
                    result["description"] = "豪华预算"
        
        # 4. 处理中文数字金额
        chinese_money_mapping = {
            '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
            '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
            '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000
        }
        
        if not result.get("amount"):
            for token in tokens:
                if token in chinese_money_mapping:
                    result["amount"] = chinese_money_mapping[token]
                    result["currency"] = "RMB"
                    break
        
        return result

    # Validation method kept for backward compatibility
    def _validate_and_normalize(self, data: dict) -> dict:
        """Return ``data`` unchanged.

        No validation or normalisation is performed here; the method hands
        back the same object it receives.
        """
        return data