Spaces:
Sleeping
Sleeping
File size: 54,361 Bytes
d81440f b180c39 632df2f 3ecb35b e86b23a 632df2f 05b4419 3ecb35b 7324283 05b4419 3ecb35b 05b4419 3ecb35b 05b4419 7324283 e86b23a 502ec94 e86b23a 502ec94 05b4419 e86b23a 502ec94 e86b23a 3ecb35b 502ec94 3ecb35b e86b23a 502ec94 3ecb35b 05b4419 e86b23a 502ec94 3ecb35b 05b4419 e86b23a 502ec94 3ecb35b 05b4419 e86b23a 502ec94 e86b23a 05b4419 b713501 3ecb35b 502ec94 3ecb35b 05b4419 3ecb35b 05b4419 3ecb35b 05b4419 3ecb35b 05b4419 3ecb35b 502ec94 3ecb35b 05b4419 3ecb35b 05b4419 502ec94 05b4419 3ecb35b 05b4419 3ecb35b 502ec94 3ecb35b 05b4419 3ecb35b 05b4419 3ecb35b 05b4419 3ecb35b 05b4419 3ecb35b 05b4419 3ecb35b 05b4419 3ecb35b 05b4419 3ecb35b 05b4419 3ecb35b 05b4419 502ec94 05b4419 3ecb35b b713501 7f130dd b713501 c22dbae b713501 502ec94 b713501 502ec94 b713501 3ecb35b b713501 3ecb35b b713501 3ecb35b b713501 3ecb35b 240c11f 05b4419 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527 
528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 |
import json
import re
from utils.logger import log
import jieba
from typing import List, Tuple
import copy
class InfoExtractor:
def __init__(self):
self.extraction_schema = {
"destination": {"type": dict, "fields": {"name": str, "country": str}},
"duration": {"type": dict, "fields": {"days": int, "description": str}},
"budget": {"type": dict, "fields": {"type": str, "amount": int, "currency": str, "description": str}}
}
# 欧洲城市和国家的完整映射关系(聚焦欧洲)
self.european_cities = {
# === 西欧 ===
# 法国
"巴黎": "法国", "里昂": "法国", "马赛": "法国", "尼斯": "法国", "戛纳": "法国",
"图卢兹": "法国", "南特": "法国", "斯特拉斯堡": "法国", "蒙彼利埃": "法国", "波尔多": "法国",
"里尔": "法国", "雷恩": "法国", "兰斯": "法国", "勒阿弗尔": "法国", "圣埃蒂安": "法国",
"土伦": "法国", "阿维尼翁": "法国", "凡尔赛": "法国", "枫丹白露": "法国", "第戎": "法国",
"昂热": "法国", "贝桑松": "法国", "佩皮尼昂": "法国", "卢尔德": "法国", "沙特尔": "法国",
# 德国
"柏林": "德国", "慕尼黑": "德国", "汉堡": "德国", "科隆": "德国", "法兰克福": "德国",
"斯图加特": "德国", "杜塞尔多夫": "德国", "多特蒙德": "德国", "埃森": "德国", "莱比锡": "德国",
"不来梅": "德国", "德累斯顿": "德国", "汉诺威": "德国", "纽伦堡": "德国", "杜伊斯堡": "德国",
"波鸿": "德国", "乌珀塔尔": "德国", "比勒费尔德": "德国", "波恩": "德国", "明斯特": "德国",
"卡尔斯鲁厄": "德国", "曼海姆": "德国", "奥格斯堡": "德国", "威斯巴登": "德国", "盖尔森基兴": "德国",
"门兴格拉德巴赫": "德国", "布伦瑞克": "德国", "基尔": "德国", "亚琛": "德国", "哈雷": "德国",
"马格德堡": "德国", "弗莱堡": "德国", "克里菲尔德": "德国", "吕贝克": "德国", "奥伯豪森": "德国",
"埃尔福特": "德国", "罗斯托克": "德国", "凯泽斯劳滕": "德国", "卡塞尔": "德国", "哈根": "德国",
"波茨坦": "德国", "萨尔布吕肯": "德国", "路德维希港": "德国", "奥尔登堡": "德国", "莱沃库森": "德国",
"奥斯纳布吕克": "德国", "索林根": "德国", "海德堡": "德国", "达姆施塔特": "德国", "哈姆": "德国",
"维尔茨堡": "德国", "雷克林豪森": "德国", "沃尔夫斯堡": "德国", "格廷根": "德国", "科特布斯": "德国",
"希尔德斯海姆": "德国", "埃朗根": "德国", "特里尔": "德国", "耶拿": "德国", "康斯坦茨": "德国",
"新天鹅堡": "德国", "罗滕堡": "德国", "科布伦茨": "德国", "班贝格": "德国", "拜罗伊特": "德国",
# 英国
"伦敦": "英国", "伯明翰": "英国", "曼彻斯特": "英国", "格拉斯哥": "英国", "利物浦": "英国",
"利兹": "英国", "谢菲尔德": "英国", "爱丁堡": "英国", "布里斯托": "英国", "莱斯特": "英国",
"考文垂": "英国", "布拉德福德": "英国", "贝尔法斯特": "英国", "卡迪夫": "英国", "诺丁汉": "英国",
"金斯顿": "英国", "纽卡斯尔": "英国", "普利茅斯": "英国", "斯托克": "英国", "南安普顿": "英国",
"雷丁": "英国", "德比": "英国", "约克": "英国", "牛津": "英国", "剑桥": "英国",
"巴斯": "英国", "温莎": "英国", "坎特伯雷": "英国", "斯特拉特福": "英国", "湖区": "英国",
"斯凯岛": "英国", "爱丁堡": "英国", "格拉斯哥": "英国", "史德灵": "英国", "珀斯": "英国",
"因弗内斯": "英国", "阿伯丁": "英国", "邓迪": "英国", "法夫": "英国", "奥班": "英国",
# 荷兰
"阿姆斯特丹": "荷兰", "鹿特丹": "荷兰", "海牙": "荷兰", "乌得勒支": "荷兰", "埃因霍温": "荷兰",
"蒂尔堡": "荷兰", "格罗宁根": "荷兰", "阿尔梅勒": "荷兰", "布雷达": "荷兰", "奈梅亨": "荷兰",
"阿珀尔多伦": "荷兰", "哈勒姆": "荷兰", "阿纳姆": "荷兰", "恩斯赫德": "荷兰", "阿默斯福特": "荷兰",
"赞丹": "荷兰", "海牙": "荷兰", "阿尔克马尔": "荷兰", "马斯特里赫特": "荷兰", "莱顿": "荷兰",
"代尔夫特": "荷兰", "多德雷赫特": "荷兰", "豪达": "荷兰", "羊角村": "荷兰", "马尔肯": "荷兰",
# 比利时
"布鲁塞尔": "比利时", "安特卫普": "比利时", "根特": "比利时", "沙勒罗瓦": "比利时", "列日": "比利时",
"布吕赫": "比利时", "那慕尔": "比利时", "蒙斯": "比利时", "阿尔斯特": "比利时", "科特赖克": "比利时",
"哈瑟尔特": "比利时", "圣尼古拉": "比利时", "奥斯坦德": "比利时", "梅赫伦": "比利时", "鲁汶": "比利时",
# 卢森堡
"卢森堡市": "卢森堡", "埃施": "卢森堡", "迪费当日": "卢森堡", "杜德朗日": "卢森堡",
# === 南欧 ===
# 意大利
"罗马": "意大利", "米兰": "意大利", "威尼斯": "意大利", "佛罗伦萨": "意大利", "那不勒斯": "意大利",
"都灵": "意大利", "帕勒莫": "意大利", "热那亚": "意大利", "博洛尼亚": "意大利", "巴里": "意大利",
"卡塔尼亚": "意大利", "佛罗伦萨": "意大利", "韦罗纳": "意大利", "威尼斯": "意大利", "墨西拿": "意大利",
"帕多瓦": "意大利", "的里雅斯特": "意大利", "塔兰托": "意大利", "布雷西亚": "意大利", "摩德纳": "意大利",
"雷焦卡拉布里亚": "意大利", "普拉托": "意大利", "卡利亚里": "意大利", "帕尔马": "意大利", "佩鲁贾": "意大利",
"利沃诺": "意大利", "雷焦艾米利亚": "意大利", "佛嘉": "意大利", "萨莱诺": "意大利", "拉温纳": "意大利",
"里米尼": "意大利", "拉斯佩齐亚": "意大利", "萨萨里": "意大利", "蒙扎": "意大利", "贝加莫": "意大利",
"比萨": "意大利", "维琴察": "意大利", "三月十五日": "意大利", "博尔扎诺": "意大利", "安德里亚": "意大利",
"阿雷佐": "意大利", "蒂沃利": "意大利", "阿西西": "意大利", "锡耶纳": "意大利", "五渔村": "意大利",
"马泰拉": "意大利", "庞贝": "意大利", "卡普里岛": "意大利", "阿马尔菲": "意大利", "科莫": "意大利",
# 西班牙
"马德里": "西班牙", "巴塞罗那": "西班牙", "瓦伦西亚": "西班牙", "塞维利亚": "西班牙", "萨拉戈萨": "西班牙",
"马拉加": "西班牙", "穆尔西亚": "西班牙", "帕尔马": "西班牙", "拉斯帕尔马斯": "西班牙", "毕尔巴鄂": "西班牙",
"阿利坎特": "西班牙", "科尔多瓦": "西班牙", "巴利亚多利德": "西班牙", "维戈": "西班牙", "希洪": "西班牙",
"莱昂": "西班牙", "拉科鲁尼亚": "西班牙", "埃尔切": "西班牙", "奥维耶多": "西班牙", "圣塞巴斯蒂安": "西班牙",
"桑坦德": "西班牙", "卡斯特利翁": "西班牙", "洛格罗尼奥": "西班牙", "巴达霍斯": "西班牙", "萨拉曼卡": "西班牙",
"韦尔瓦": "西班牙", "阿尔梅里亚": "西班牙", "卡迪斯": "西班牙", "格拉纳达": "西班牙", "托莱多": "西班牙",
"昆卡": "西班牙", "卡塞雷斯": "西班牙", "塞哥维亚": "西班牙", "阿维拉": "西班牙", "布尔戈斯": "西班牙",
"马略卡岛": "西班牙", "伊比萨": "西班牙", "特内里费": "西班牙", "大加那利": "西班牙", "兰萨罗特": "西班牙",
# 葡萄牙
"里斯本": "葡萄牙", "波尔图": "葡萄牙", "阿马多拉": "葡萄牙", "布拉加": "葡萄牙", "塞图巴尔": "葡萄牙",
"科英布拉": "葡萄牙", "丰沙尔": "葡萄牙", "阿威罗": "葡萄牙", "埃武拉": "葡萄牙", "法鲁": "葡萄牙",
"阿尔布费拉": "葡萄牙", "辛特拉": "葡萄牙", "卡斯凯什": "葡萄牙", "奥比杜什": "葡萄牙", "波尔塔莱格雷": "葡萄牙",
"吉马良斯": "葡萄牙", "维亚纳堡": "葡萄牙", "维塞乌": "葡萄牙", "拉戈什": "葡萄牙", "萨格里什": "葡萄牙",
# 希腊
"雅典": "希腊", "塞萨洛尼基": "希腊", "帕特雷": "希腊", "伊拉克利翁": "希腊", "拉里萨": "希腊",
"沃洛斯": "希腊", "约阿尼纳": "希腊", "卡瓦拉": "希腊", "哈尼亚": "希腊", "塞雷斯": "希腊",
"圣托里尼": "希腊", "米科诺斯": "希腊", "罗德岛": "希腊", "科孚": "希腊", "克里特": "希腊",
"帕罗斯": "希腊", "纳克索斯": "希腊", "扎金索斯": "希腊", "凯法利尼亚": "希腊", "斯基亚索斯": "希腊",
"德尔菲": "希腊", "奥林匹亚": "希腊", "迈锡尼": "希腊", "埃皮达鲁斯": "希腊", "梅泰奥拉": "希腊",
# === 中欧 ===
# 奥地利
"维也纳": "奥地利", "格拉茨": "奥地利", "林茨": "奥地利", "萨尔茨堡": "奥地利", "因斯布鲁克": "奥地利",
"克拉根福": "奥地利", "菲拉赫": "奥地利", "韦尔斯": "奥地利", "圣珀尔滕": "奥地利", "多恩比恩": "奥地利",
"维也纳新城": "奥地利", "施泰尔": "奥地利", "费尔德基兴": "奥地利", "布鲁克": "奥地利", "莱奥本": "奥地利",
"哈尔施塔特": "奥地利", "巴德伊舍尔": "奥地利", "梅尔克": "奥地利", "瓦绍": "奥地利", "库夫斯坦": "奥地利",
# 捷克
"布拉格": "捷克", "布尔诺": "捷克", "俄斯特拉发": "捷克", "比尔森": "捷克", "奥洛穆茨": "捷克",
"利贝雷茨": "捷克", "赫拉德茨克拉洛韦": "捷克", "乌斯季": "捷克", "帕尔杜比采": "捷克", "兹林": "捷克",
"哈维若夫": "捷克", "克拉德诺": "捷克", "切斯凯布杰约维采": "捷克", "莫斯特": "捷克", "卡尔维纳": "捷克",
"库特纳霍拉": "捷克", "泰尔奇": "捷克", "克鲁姆洛夫": "捷克", "卡尔什特因": "捷克", "布拉格城堡": "捷克",
# 匈牙利
"布达佩斯": "匈牙利", "德布勒森": "匈牙利", "塞格德": "匈牙利", "米什科尔茨": "匈牙利", "佩奇": "匈牙利",
"焦尔": "匈牙利", "尼赖吉哈佐": "匈牙利", "凯奇凯梅特": "匈牙利", "塞克什白堡": "匈牙利", "松博特海伊": "匈牙利",
"松博特海伊": "匈牙利", "维斯普雷姆": "匈牙利", "埃格尔": "匈牙利", "贝凯什乔包": "匈牙利", "大沃拉丁": "匈牙利",
"埃斯泰尔戈姆": "匈牙利", "维谢格拉德": "匈牙利", "霍洛克": "匈牙利", "蒂豪尼": "匈牙利", "巴拉顿湖": "匈牙利",
# 波兰
"华沙": "波兰", "克拉科夫": "波兰", "罗兹": "波兰", "弗罗茨瓦夫": "波兰", "波兹南": "波兰",
"格但斯克": "波兰", "什切青": "波兰", "比得哥什": "波兰", "卢布林": "波兰", "卡托维兹": "波兰",
"白雅斯托克": "波兰", "格丁尼亚": "波兰", "琴斯托霍瓦": "波兰", "拉多姆": "波兰", "索斯诺维茨": "波兰",
"托伦": "波兰", "基尔采": "波兰", "格利维采": "波兰", "扎布热": "波兰", "比托姆": "波兰",
"奥斯威辛": "波兰", "马尔堡": "波兰", "扎科帕内": "波兰", "维利奇卡": "波兰", "弗罗茨瓦夫": "波兰",
# 斯洛伐克
"布拉迪斯拉发": "斯洛伐克", "科希策": "斯洛伐克", "普雷绍夫": "斯洛伐克", "日利纳": "斯洛伐克", "班斯卡比斯特里察": "斯洛伐克",
"尼特拉": "斯洛伐克", "特伦钦": "斯洛伐克", "马丁": "斯洛伐克", "特尔纳瓦": "斯洛伐克", "波普拉德": "斯洛伐克",
"普里维德扎": "斯洛伐克", "兹沃伦": "斯洛伐克", "巴尔代约夫": "斯洛伐克", "列沃恰": "斯洛伐克", "斯皮什斯基堡": "斯洛伐克",
# 斯洛文尼亚
"卢布尔雅那": "斯洛文尼亚", "马里博尔": "斯洛文尼亚", "采列": "斯洛文尼亚", "克拉尼": "斯洛文尼亚", "韦莱涅": "斯洛文尼亚",
"新戈里察": "斯洛文尼亚", "科佩尔": "斯洛文尼亚", "诺沃梅斯托": "斯洛文尼亚", "卡姆尼克": "斯洛文尼亚", "多姆扎勒": "斯洛文尼亚",
"布莱德": "斯洛文尼亚", "博希尼": "斯洛文尼亚", "皮兰": "斯洛文尼亚", "什科茨扬": "斯洛文尼亚", "波斯托伊纳": "斯洛文尼亚",
# 瑞士
"苏黎世": "瑞士", "日内瓦": "瑞士", "巴塞尔": "瑞士", "伯尔尼": "瑞士", "洛桑": "瑞士",
"圣加仑": "瑞士", "卢塞恩": "瑞士", "卢加诺": "瑞士", "比尔": "瑞士", "图恩": "瑞士",
"拉绍德封": "瑞士", "沙夫豪森": "瑞士", "弗里堡": "瑞士", "韦维": "瑞士", "拉佩斯": "瑞士",
"因特拉肯": "瑞士", "采尔马特": "瑞士", "格林德瓦": "瑞士", "少女峰": "瑞士", "马特洪峰": "瑞士",
"圣莫里茨": "瑞士", "洛伊克巴德": "瑞士", "安德马特": "瑞士", "文根": "瑞士", "拉克斯": "瑞士",
# === 北欧 ===
# 瑞典
"斯德哥尔摩": "瑞典", "哥德堡": "瑞典", "马尔默": "瑞典", "乌普萨拉": "瑞典", "林雪平": "瑞典",
"韦斯特罗斯": "瑞典", "厄勒布鲁": "瑞典", "北雪平": "瑞典", "赫尔辛堡": "瑞典", "永雪平": "瑞典",
"松兹瓦尔": "瑞典", "于默奥": "瑞典", "韦克舍": "瑞典", "加夫勒": "瑞典", "博罗斯": "瑞典",
"法伦": "瑞典", "卡尔斯塔德": "瑞典", "卡尔马": "瑞典", "维斯比": "瑞典", "基律纳": "瑞典",
# 挪威
"奥斯陆": "挪威", "卑尔根": "挪威", "特隆赫姆": "挪威", "斯塔万格": "斯洛文尼亚", "克里斯蒂安桑": "挪威",
"腓特烈斯塔": "挪威", "德拉门": "挪威", "谢恩": "挪威", "桑内斯": "挪威", "萨尔普斯堡": "挪威",
"特洛姆瑟": "挪威", "博多": "挪威", "阿尔塔": "挪威", "哈默菲斯特": "挪威", "纳尔维克": "挪威",
"弗洛姆": "挪威", "盖朗厄尔": "挪威", "奥勒松": "挪威", "利勒哈默尔": "挪威", "罗弗敦群岛": "挪威",
# 丹麦
"哥本哈根": "丹麦", "奥胡斯": "丹麦", "欧登塞": "丹麦", "奥尔堡": "丹麦", "埃斯比约": "丹麦",
"兰德斯": "丹麦", "科尔丁": "丹麦", "赫尔辛格": "丹麦", "马里布": "丹麦", "海勒鲁普": "丹麦",
"比隆": "丹麦", "希勒勒": "丹麦", "罗斯基勒": "丹麦", "斯卡恩": "丹麦", "法尔瑟特": "丹麦",
# 芬兰
"赫尔辛基": "芬兰", "埃斯波": "芬兰", "坦佩雷": "芬兰", "万塔": "芬兰", "图尔库": "芬兰",
"奥卢": "芬兰", "拉赫蒂": "芬兰", "库奥皮奥": "芬兰", "约恩苏": "芬兰", "约瓦斯屈莱": "芬兰",
"拉彭兰塔": "芬兰", "科特卡": "芬兰", "瓦萨": "芬兰", "弗绍": "芬兰", "海门林纳": "芬兰",
"罗瓦涅米": "芬兰", "凯米": "芬兰", "托尔尼奥": "芬兰", "萨利色尔卡": "芬兰", "伊瓦洛": "芬兰",
# 冰岛
"雷克雅未克": "冰岛", "科帕沃古尔": "冰岛", "哈夫纳夫约杜尔": "冰岛", "阿克雷里": "冰岛", "雷克雅内斯": "冰岛",
"塞尔福斯": "冰岛", "韦斯特曼纳群岛": "冰岛", "胡萨维克": "冰岛", "埃伊尔斯塔济": "冰岛", "凯夫拉维克": "冰岛",
# === 东欧 ===
# 俄罗斯(欧洲部分)
"莫斯科": "俄罗斯", "圣彼得堡": "俄罗斯", "下诺夫哥罗德": "俄罗斯", "喀山": "俄罗斯", "萨马拉": "俄罗斯",
"伏尔加格勒": "俄罗斯", "罗斯托夫": "俄罗斯", "乌法": "俄罗斯", "彭萨": "俄罗斯", "雅罗斯拉夫": "俄罗斯",
"卡卢加": "俄罗斯", "图拉": "俄罗斯", "弗拉基米尔": "俄罗斯", "苏兹达尔": "俄罗斯", "谢尔盖夫": "俄罗斯",
# 乌克兰
"基辅": "乌克兰", "哈尔科夫": "乌克兰", "敖德萨": "乌克兰", "第聂伯": "乌克兰", "顿涅茨克": "乌克兰",
"扎波罗热": "乌克兰", "利沃夫": "乌克兰", "克里沃罗格": "乌克兰", "尼古拉耶夫": "乌克兰", "马里乌波尔": "乌克兰",
"卢甘斯克": "乌克兰", "文尼察": "乌克兰", "赫尔松": "乌克兰", "切尔卡瑟": "乌克兰", "切尔尼戈夫": "乌克兰",
# 白俄罗斯
"明斯克": "白俄罗斯", "戈梅利": "白俄罗斯", "莫吉廖夫": "白俄罗斯", "维帖布斯克": "白俄罗斯", "格罗德诺": "白俄罗斯",
"布列斯特": "白俄罗斯", "鲍里索夫": "白俄罗斯", "巴拉诺维奇": "白俄罗斯", "平斯克": "白俄罗斯", "奥尔沙": "白俄罗斯",
# 波罗的海三国
"里加": "拉脱维亚", "陶格夫匹尔斯": "拉脱维亚", "利耶帕亚": "拉脱维亚", "叶尔加瓦": "拉脱维亚", "文茨皮尔斯": "拉脱维亚",
"塔林": "爱沙尼亚", "塔尔图": "爱沙尼亚", "纳尔瓦": "爱沙尼亚", "帕尔努": "爱沙尼亚", "科赫特拉": "爱沙尼亚",
"维尔纽斯": "立陶宛", "考纳斯": "立陶宛", "克莱佩达": "立陶宛", "希奥利艾": "立陶宛", "帕内韦日斯": "立陶宛",
# 摩尔多瓦
"基希讷乌": "摩尔多瓦", "蒂拉斯波尔": "摩尔多瓦", "巴尔济": "摩尔多瓦", "本德尔": "摩尔多瓦", "雷布尼察": "摩尔多瓦",
# === 巴尔干半岛 ===
# 克罗地亚
"萨格勒布": "克罗地亚", "斯普利特": "克罗地亚", "里耶卡": "克罗地亚", "奥西耶克": "克罗地亚", "扎达尔": "克罗地亚",
"普拉": "克罗地亚", "杜布罗夫尼克": "克罗地亚", "希贝尼克": "克罗地亚", "卡尔洛瓦茨": "克罗地亚", "瓦拉日丁": "克罗地亚",
"罗维尼": "克罗地亚", "波雷奇": "克罗地亚", "特罗吉尔": "克罗地亚", "赫瓦尔": "克罗地亚", "科尔丘拉": "克罗地亚",
# 塞尔维亚
"贝尔格莱德": "塞尔维亚", "诺维萨德": "塞尔维亚", "尼什": "塞尔维亚", "克拉古耶瓦茨": "塞尔维亚", "苏博蒂察": "塞尔维亚",
"潘切沃": "塞尔维亚", "泽蒙": "塞尔维亚", "莱斯科瓦茨": "塞尔维亚", "恰恰克": "塞尔维亚", "新帕扎尔": "塞尔维亚",
# 波黑
"萨拉热窝": "波黑", "巴尼亚卢卡": "波黑", "图兹拉": "波黑", "泽尼察": "波黑", "莫斯塔尔": "波黑",
"比哈奇": "波黑", "布里耶利纳": "波黑", "多博伊": "波黑", "格拉迪什卡": "波黑", "利夫诺": "波黑",
# 黑山
"波德戈里察": "黑山", "尼克希奇": "黑山", "普里耶波列": "黑山", "比耶洛波列": "黑山", "采蒂涅": "黑山",
"布德瓦": "黑山", "科托尔": "黑山", "乌尔齐尼": "黑山", "赫尔采格诺维": "黑山", "巴尔": "黑山",
# 北马其顿
"斯科普里": "北马其顿", "库马诺沃": "北马其顿", "比托拉": "北马其顿", "普里莱普": "北马其顿", "特托沃": "北马其顿",
"韦莱斯": "北马其顿", "什蒂普": "北马其顿", "奥赫里德": "北马其顿", "戈斯蒂瓦尔": "北马其顿", "斯特鲁加": "北马其顿",
# 阿尔巴尼亚
"地拉那": "阿尔巴尼亚", "都拉斯": "阿尔巴尼亚", "埃尔巴桑": "阿尔巴尼亚", "发罗拉": "阿尔巴尼亚", "斯库台": "阿尔巴尼亚",
"科尔察": "阿尔巴尼亚", "卢什涅": "阿尔巴尼亚", "费里": "阿尔巴尼亚", "贝拉特": "阿尔巴尼亚", "吉诺卡斯特": "阿尔巴尼亚",
# 保加利亚
"索菲亚": "保加利亚", "普罗夫迪夫": "保加利亚", "瓦尔纳": "保加利亚", "布尔加斯": "保加利亚", "鲁塞": "保加利亚",
"斯塔拉扎戈拉": "保加利亚", "普列文": "保加利亚", "슬리문": "保加利亚", "多布里奇": "保加利亚", "舒门": "保加利亚",
"帕扎尔吉克": "保加利亚", "哈斯科沃": "保加利亚", "扬博尔": "保加利亚", "布拉戈耶夫格勒": "保加利亚", "韦利科特尔诺沃": "保加利亚",
# 罗马尼亚
"布加勒斯特": "罗马尼亚", "克卢日": "罗马尼亚", "蒂米什瓦拉": "罗马尼亚", "雅西": "罗马尼亚", "康斯坦察": "罗马尼亚",
"克拉约瓦": "罗马尼亚", "布拉索夫": "罗马尼亚", "加拉茨": "罗马尼亚", "普洛耶什蒂": "罗马尼亚", "奥拉迪亚": "罗马尼亚",
"布勒伊拉": "罗马尼亚", "阿拉德": "罗马尼亚", "皮特什蒂": "罗马尼亚", "锡比乌": "罗马尼亚", "巴克乌": "罗马尼亚",
"锡纳亚": "罗马尼亚", "布兰": "罗马尼亚", "德古拉城堡": "罗马尼亚", "佩莱什城堡": "罗马尼亚", "马拉穆雷什": "罗马尼亚",
# 土耳其(欧洲部分)
"伊斯坦布尔": "土耳其", "埃迪尔内": "土耳其", "泰基尔达": "土耳其", "克尔克拉雷利": "土耳其", "恰纳卡莱": "土耳其",
# 塞浦路斯
"尼科西亚": "塞浦路斯", "利马索尔": "塞浦路斯", "拉纳卡": "塞浦路斯", "法马古斯塔": "塞浦路斯", "帕福斯": "塞浦路斯",
"凯里尼亚": "塞浦路斯", "阿依纳帕": "塞浦路斯", "普罗塔拉斯": "塞浦路斯", "特罗多斯": "塞浦路斯", "阿卡马斯": "塞浦路斯",
# 马耳他
"瓦莱塔": "马耳他", "斯利马": "马耳他", "圣朱利安斯": "马耳他", "姆西达": "马耳他", "维多利亚": "马耳他",
"马尔萨什洛克": "马耳他", "梅利哈": "马耳他", "戈佐": "马耳他", "蓝湖": "马耳他", "姆迪纳": "马耳他",
}
# 欧洲城市别名映射(包含各种表达方式)
self.european_city_aliases = {
# 英文名称映射
"paris": "巴黎", "rome": "罗马", "london": "伦敦", "berlin": "柏林",
"madrid": "马德里", "barcelona": "巴塞罗那", "vienna": "维也纳", "prague": "布拉格",
"amsterdam": "阿姆斯特丹", "florence": "佛罗伦萨", "venice": "威尼斯", "athens": "雅典",
"budapest": "布达佩斯", "lisbon": "里斯本", "stockholm": "斯德哥尔摩", "copenhagen": "哥本哈根",
"helsinki": "赫尔辛基", "oslo": "奥斯陆", "zurich": "苏黎世", "geneva": "日内瓦",
"munich": "慕尼黑", "milan": "米兰", "naples": "那不勒斯", "nice": "尼斯",
"edinburgh": "爱丁堡", "dublin": "都柏林", "brussels": "布鲁塞尔", "warsaw": "华沙",
"krakow": "克拉科夫", "zagreb": "萨格勒布", "belgrade": "贝尔格莱德", "sofia": "索菲亚",
"bucharest": "布加勒斯特", "kiev": "基辅", "moscow": "莫斯科", "st petersburg": "圣彼得堡",
"reykjavik": "雷克雅未克", "tallinn": "塔林", "riga": "里加", "vilnius": "维尔纽斯",
"bratislava": "布拉迪斯拉发", "ljubljana": "卢布尔雅那", "sarajevo": "萨拉热窝",
"dubrovnik": "杜布罗夫尼克", "split": "斯普利特", "santorini": "圣托里尼", "mykonos": "米科诺斯",
# 中文别名
"花都": "巴黎", "光之城": "巴黎", "永恒之城": "罗马", "雾都": "伦敦",
"音乐之都": "维也纳", "黄金城市": "布拉格", "千塔之城": "布拉格",
"运河之城": "阿姆斯特丹", "翡冷翠": "佛罗伦萨", "文艺复兴之都": "佛罗伦萨",
"水城": "威尼斯", "西方文明的摇篮": "雅典", "多瑙河明珠": "布达佩斯",
"七丘之城": "里斯本", "北方威尼斯": "斯德哥尔摩", "童话之都": "哥本哈根",
"波罗的海的女儿": "赫尔辛基", "欧洲屋脊": "因特拉肯", "北方雅典": "爱丁堡",
"翡翠岛": "都柏林", "欧洲之都": "布鲁塞尔", "高迪之城": "巴塞罗那",
}
self.chinese_numbers = {
'一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10,
'两': 2, '半': 0.5, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10,
# 英文数字
'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10,
'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15,
# 特殊时长表达
'半个月': 15, '一个月': 30, '半年': 180, '一年': 365,
'半天': 0.5, '一天': 1, '两天': 2, '三天': 3, '四天': 4, '五天': 5, '六天': 6, '七天': 7,
'八天': 8, '九天': 9, '十天': 10, '半周': 3.5, '一周': 7, '两周': 14,
# 假期相关
'小长假': 3, '长假': 7, '十一': 7, '国庆': 7, '春节': 7, '五一': 3, '清明': 3,
'端午': 3, '中秋': 3, '元旦': 3, '暑假': 60, '寒假': 30, '周末': 2, '长周末': 3,
# 英文假期
'weekend': 2, 'week': 7, 'month': 30, 'vacation': 7, 'holiday': 3
}
def extract(self, user_message: str, existing_info: dict = None) -> dict:
    """Extract destination, duration and budget from a user message.

    Args:
        user_message: raw text typed by the user.
        existing_info: previously collected information; it is never
            mutated (a deep copy is updated instead).

    Returns:
        ``existing_info`` updated with whatever was found in this message.
        For an invalid or too-short message, ``existing_info`` is returned
        as-is (or ``{}`` when there is none).

    Previously the deep copy of ``existing_info`` was built and then
    discarded, so valid messages returned only the new fields while
    invalid ones returned the old context. The result is now always the
    merged view, as the log message announces.
    """
    # Input validation
    if not user_message or not isinstance(user_message, str):
        log.warning("⚠️ 收到无效的用户消息")
        return existing_info or {}
    if len(user_message.strip()) < 2:
        log.warning("⚠️ 用户消息过短,跳过信息提取")
        return existing_info or {}

    if existing_info:
        log.info(f"接收到上下文信息,将在此基础上更新: {existing_info}")
        result = copy.deepcopy(existing_info)
    else:
        result = {}

    log.info(f"🛠️ 使用分词策略提取信息:'{user_message[:50]}...'")

    # 1. Tokenise
    tokens = self._tokenize_message(user_message)
    log.info(f"📝 分词结果:{tokens}")

    # 2. Run each field extractor over the tokens; keep non-empty results
    extractors = (
        ("destination", self._extract_destination_from_tokens),
        ("duration", self._extract_duration_from_tokens),
        ("budget", self._extract_budget_from_tokens),
    )
    newly_extracted_info = {}
    for field, extractor in extractors:
        info = extractor(tokens)
        if info:
            newly_extracted_info[field] = info
    log.info(f"📊 分词提取结果: {newly_extracted_info}")

    # 3. Fold the new fields into the copied context
    return self._merge_info(newly_extracted_info, result)
def _merge_info(self, new_info: dict, existing_info: dict) -> dict:
for key, value in new_info.items():
# 如果新旧信息中同一个键的值都是字典,则递归深入合并
if isinstance(value, dict) and key in existing_info and isinstance(existing_info[key], dict):
self._merge_info(value, existing_info[key])
else:
# 否则,直接用新信息覆盖或添加
existing_info[key] = value
return existing_info
def _tokenize_message(self, text: str) -> list:
"""智能分词,支持中英文混合"""
# 预处理:统一标点符号和空格
text = text.replace(',', ',').replace('。', '.').replace('!', '!').replace('?', '?')
text = text.replace('(', '(').replace(')', ')').replace('【', '[').replace('】', ']')
tokens = []
current_token = ""
i = 0
while i < len(text):
char = text[i]
# 处理空格和标点符号
if char in ' ,,.。!!??()()[]【】::;;':
if current_token:
tokens.append(current_token)
current_token = ""
if char.strip(): # 保留非空格的标点符号
tokens.append(char)
i += 1
continue
# 处理数字(包括小数和货币符号)
if char.isdigit() or char in '¥$€£₩':
if current_token and not (current_token[-1].isdigit() or current_token[-1] in '¥$€£₩.'):
tokens.append(current_token)
current_token = char
else:
current_token += char
# 继续读取数字部分
i += 1
while i < len(text) and (text[i].isdigit() or text[i] in '.,'):
current_token += text[i]
i += 1
# 检查货币单位
currency_units = ['元', '块', '钱', '欧', '美元', '英镑', '日元', '韩元', '瑞郎', 'rmb', 'usd', 'eur', 'gbp', 'jpy', 'krw', 'chf']
remaining_text = text[i:].lower()
for unit in currency_units:
if remaining_text.startswith(unit):
current_token += text[i:i+len(unit)]
i += len(unit)
break
tokens.append(current_token)
current_token = ""
continue
# 处理英文单词
if char.isalpha() and ord(char) < 128: # ASCII字符
if current_token and not current_token[-1].isalpha():
tokens.append(current_token)
current_token = char
else:
current_token += char
# 继续读取英文字符
i += 1
while i < len(text) and text[i].isalpha() and ord(text[i]) < 128:
current_token += text[i]
i += 1
tokens.append(current_token)
current_token = ""
continue
# 处理中文字符
if self._is_chinese_char(char):
if current_token and not self._is_chinese_char(current_token[-1]):
tokens.append(current_token)
current_token = ""
# 对于中文,我们需要智能分词
# 检查是否是多字符城市名、时间表达等
remaining_text = text[i:]
# 尝试匹配城市名
matched_city = self._match_city_name(remaining_text)
if matched_city:
tokens.append(matched_city)
i += len(matched_city)
continue
# 尝试匹配时间表达
matched_time = self._match_time_expression(remaining_text)
if matched_time:
tokens.append(matched_time)
i += len(matched_time)
continue
# 尝试匹配预算类型关键词
matched_budget_type = self._match_budget_type(remaining_text)
if matched_budget_type:
tokens.append(matched_budget_type)
i += len(matched_budget_type)
continue
# 尝试匹配常见词汇
matched_word = self._match_common_word(remaining_text)
if matched_word:
tokens.append(matched_word)
i += len(matched_word)
continue
# 单个中文字符
tokens.append(char)
i += 1
else:
# 其他字符
current_token += char
i += 1
# 处理最后的token
if current_token:
tokens.append(current_token)
# 后处理:合并一些相关的tokens
tokens = self._post_process_tokens(tokens)
return [token for token in tokens if token.strip()] # 过滤空token
def _is_chinese_char(self, char: str) -> bool:
"""判断是否为中文字符"""
return '\u4e00' <= char <= '\u9fff'
def _match_city_name(self, text: str) -> str:
"""匹配城市名称"""
# 按长度从长到短排序,优先匹配长的城市名
all_cities = list(self.european_cities.keys()) + list(self.european_city_aliases.keys())
all_cities = sorted(set(all_cities), key=len, reverse=True)
for city in all_cities:
if text.startswith(city):
return city
return ""
def _match_time_expression(self, text: str) -> str:
"""匹配时间表达"""
time_expressions = [
# 多字符时间表达
'半个月', '一个月', '两个月', '三个月', '半年', '一年',
'小长假', '长周末', '国庆节', '春节假期', '暑假', '寒假',
'一天半', '两天半', '三天半', '一周半', '两周',
# 英文时间表达
'one day', 'two days', 'three days', 'one week', 'two weeks',
'long weekend', 'vacation', 'holiday', 'spring break'
]
# 按长度排序,优先匹配长表达
time_expressions = sorted(time_expressions, key=len, reverse=True)
text_lower = text.lower()
for expr in time_expressions:
if text_lower.startswith(expr.lower()):
return expr
if text.startswith(expr):
return expr
return ""
def _match_budget_type(self, text: str) -> str:
"""匹配预算类型关键词"""
budget_keywords = [
# 经济型
'经济实惠', '省钱', '便宜', '实惠', '经济', '穷游', '背包客',
'青年旅社', '学生', '预算有限', '性价比',
# 舒适型
'舒适', '中等', '适中', '标准', '普通', '中档', '合理',
# 豪华型
'豪华', '奢华', '高端', '顶级', '精品', '五星', '不差钱',
'任性', '土豪', 'VIP', '贵族', '皇家'
]
# 按长度排序
budget_keywords = sorted(budget_keywords, key=len, reverse=True)
for keyword in budget_keywords:
if text.startswith(keyword):
return keyword
return ""
def _match_common_word(self, text: str) -> str:
"""匹配常见词汇"""
common_words = [
# 旅行相关动词
'想去', '计划去', '打算去', '准备去', '希望去', '考虑去',
'前往', '旅行', '旅游', '游玩', '度假', '出发', '飞往',
# 时间相关
'三天', '四天', '五天', '六天', '七天', '八天', '九天', '十天',
'一天', '两天', '几天', '多天', '数天',
# 预算相关
'预算', '花费', '费用', '成本', '开销', '支出', '消费',
'总共', '一共', '大概', '约', '左右', '差不多',
# 其他
'行程', '计划', '安排', '路线', '攻略'
]
# 按长度排序
common_words = sorted(common_words, key=len, reverse=True)
for word in common_words:
if text.startswith(word):
return word
return ""
def _post_process_tokens(self, tokens: list) -> list:
"""后处理tokens,合并相关的片段"""
if not tokens:
return tokens
processed = []
i = 0
while i < len(tokens):
current_token = tokens[i]
# 合并数字+单位的组合
if i < len(tokens) - 1:
next_token = tokens[i + 1]
# 数字 + 货币单位
if (current_token.isdigit() and
next_token.lower() in ['元', '块', '钱', '欧', '美元', '英镑', '日元', 'rmb', 'usd', 'eur', 'gbp', 'jpy']):
processed.append(current_token + next_token)
i += 2
continue
# 数字 + 时间单位
if (current_token.isdigit() and
next_token in ['天', '日', '周', '月', '年', 'days', 'weeks', 'months']):
processed.append(current_token + next_token)
i += 2
continue
# 预算 + 数字
if current_token == '预算' and next_token.replace('.', '').replace(',', '').isdigit():
if i < len(tokens) - 2 and tokens[i + 2] in ['元', '块', '钱', '欧', 'rmb', 'usd', 'eur']:
processed.append(current_token + next_token + tokens[i + 2])
i += 3
continue
else:
processed.append(current_token + next_token)
i += 2
continue
processed.append(current_token)
i += 1
return processed
def _extract_destination_from_tokens(self, tokens: list) -> dict:
"""从tokens中提取目的地信息"""
result = {}
# 查找城市名
for i, token in enumerate(tokens):
# 直接匹配城市名
city_name = self._normalize_city_name(token)
if city_name:
result["name"] = city_name
if city_name in self.european_cities:
result["country"] = self.european_cities[city_name]
break
# 检查是否在动词后面
if i > 0:
prev_token = tokens[i - 1]
if prev_token in ['去', '到', '想去', '前往', '旅行', '游', '玩', 'go', 'to', 'visit', 'travel']:
city_name = self._normalize_city_name(token)
if city_name:
result["name"] = city_name
if city_name in self.european_cities:
result["country"] = self.european_cities[city_name]
break
# 如果没有找到,尝试fuzzy匹配
if not result:
for token in tokens:
if len(token) >= 2:
# 模糊匹配城市名
for city, country in self.european_cities.items():
if token in city or city in token:
if len(token) >= len(city) * 0.6: # 相似度阈值
result["name"] = city
result["country"] = country
break
if result:
break
return result
def _normalize_city_name(self, token: str) -> str:
"""标准化城市名称"""
if not token:
return ""
token_lower = token.lower().strip()
# 直接匹配
if token in self.european_cities:
return token
# 别名匹配
if token_lower in self.european_city_aliases:
return self.european_city_aliases[token_lower]
if token in self.european_city_aliases:
return self.european_city_aliases[token]
return ""
def _extract_duration_from_tokens(self, tokens: list) -> dict:
"""从tokens中提取时长信息"""
result = {}
for i, token in enumerate(tokens):
days = None
description = ""
# 处理 "数字+天" 的token
if re.match(r'^\d+[天日]$', token):
days = int(re.findall(r'\d+', token)[0])
# 处理 "数字+weeks/days" 的token
elif re.match(r'^\d+(days?|weeks?|months?)$', token.lower()):
number = int(re.findall(r'\d+', token)[0])
unit = re.findall(r'[a-zA-Z]+', token.lower())[0]
if unit.startswith('day'):
days = number
elif unit.startswith('week'):
days = number * 7
elif unit.startswith('month'):
days = number * 30
# 处理分离的数字和单位
elif token.isdigit() and i < len(tokens) - 1:
next_token = tokens[i + 1]
number = int(token)
if next_token in ['天', '日']:
days = number
elif next_token in ['周', '星期', '礼拜', 'week', 'weeks']:
days = number * 7
elif next_token in ['月', '个月', 'month', 'months']:
days = number * 30
# 处理中文数字
elif token in self.chinese_numbers:
days = self.chinese_numbers[token]
description = token
# 处理特殊时长表达
elif token in ['周末', 'weekend']:
days = 2
description = token
elif token in ['长周末', 'long weekend']:
days = 3
description = token
elif token in ['小长假', 'vacation', 'holiday']:
days = 3
description = token
elif token in ['十一', '国庆', 'national day']:
days = 7
description = token
elif token in ['春节', 'spring festival']:
days = 7
description = token
elif token in ['暑假', 'summer vacation']:
days = 60
description = token
elif token in ['寒假', 'winter vacation']:
days = 30
description = token
# 处理复合表达 "三天两夜"
elif re.match(r'^[一二三四五六七八九十\d]+天', token):
# 提取数字部分
for num_token in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']:
if token.startswith(num_token):
days = self.chinese_numbers[num_token]
description = token
break
if not days and token[0].isdigit():
days = int(token[0])
description = token
# 验证天数合理性并设置结果
if days and 0.5 <= days <= 365:
result["days"] = int(days) if days >= 1 else days
if not description:
# 添加描述信息
if days <= 1:
description = "当日往返"
elif days <= 3:
description = "短途旅行"
elif days <= 7:
description = "一周内旅行"
elif days <= 14:
description = "中长途旅行"
elif days <= 30:
description = "长途旅行"
else:
description = "超长途旅行"
result["description"] = description
break
return result
def _extract_budget_from_tokens(self, tokens: list) -> dict:
"""从tokens中提取预算信息"""
result = {}
# 1. 查找金额
for i, token in enumerate(tokens):
amount = None
currency = "RMB" # 默认货币
# 处理包含货币的token "2000欧", "5000元"
currency_patterns = [
(r'(\d+(?:\.\d+)?)欧(?:元)?', 'EUR'),
(r'(\d+(?:\.\d+)?)元', 'RMB'),
(r'(\d+(?:\.\d+)?)块(?:钱)?', 'RMB'),
(r'(\d+(?:\.\d+)?)人民币', 'RMB'),
(r'(\d+(?:\.\d+)?)美元', 'USD'),
(r'(\d+(?:\.\d+)?)英镑', 'GBP'),
(r'(\d+(?:\.\d+)?)瑞(?:士)?法郎', 'CHF'),
(r'(\d+(?:\.\d+)?)日元', 'JPY'),
(r'(\d+(?:\.\d+)?)韩元', 'KRW'),
(r'¥(\d+(?:\.\d+)?)', 'RMB'),
(r'€(\d+(?:\.\d+)?)', 'EUR'),
(r'\$(\d+(?:\.\d+)?)', 'USD'),
(r'£(\d+(?:\.\d+)?)', 'GBP'),
(r'(\d+(?:\.\d+)?)rmb', 'RMB'),
(r'(\d+(?:\.\d+)?)usd', 'USD'),
(r'(\d+(?:\.\d+)?)eur', 'EUR'),
(r'(\d+(?:\.\d+)?)gbp', 'GBP'),
(r'(\d+(?:\.\d+)?)chf', 'CHF'),
]
for pattern, curr in currency_patterns:
match = re.search(pattern, token.lower())
if match:
amount = float(match.group(1))
currency = curr
break
# 处理纯数字token(需要查看上下文)
if not amount and re.match(r'^\d+(?:\.\d+)?$', token):
number = float(token)
# 检查前面的token是否有预算相关词汇
budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', '总共', '一共', 'budget', 'cost', 'spend']
has_budget_context = False
if i > 0 and tokens[i-1] in budget_indicators:
has_budget_context = True
elif i > 1 and tokens[i-2] in budget_indicators:
has_budget_context = True
# 检查后面是否有货币单位
if i < len(tokens) - 1:
next_token = tokens[i + 1].lower()
currency_units = {
'元': 'RMB', '块': 'RMB', '钱': 'RMB', '人民币': 'RMB',
'欧': 'EUR', '欧元': 'EUR', '美元': 'USD', '英镑': 'GBP',
'瑞郎': 'CHF', '日元': 'JPY', '韩元': 'KRW',
'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF'
}
if next_token in currency_units:
amount = number
currency = currency_units[next_token]
has_budget_context = True
# 如果有预算上下文但没有明确货币单位,根据数字大小推断
if has_budget_context and not amount:
if number < 100: # 可能是欧元或美元
# 查看是否有欧洲城市上下文
has_european_context = any(self._normalize_city_name(t) for t in tokens)
if has_european_context:
currency = 'EUR'
else:
currency = 'USD'
else:
currency = 'RMB' # 大数字更可能是人民币
amount = number
# 处理万、千等单位
if amount:
# 检查是否有万、千修饰符
if i > 0:
prev_token = tokens[i-1]
if '万' in prev_token or 'w' in prev_token.lower():
amount *= 10000
elif '千' in prev_token or 'k' in prev_token.lower():
amount *= 1000
elif i < len(tokens) - 1:
next_token = tokens[i+1]
if '万' in next_token or 'w' in next_token.lower():
amount *= 10000
elif '千' in next_token or 'k' in next_token.lower():
amount *= 1000
if amount > 0:
result["amount"] = int(amount)
result["currency"] = currency
break
# 2. 查找预算类型
budget_type_keywords = {
'economy': [
'经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '青年',
'预算有限', '钱不多', '不贵', '划算', '性价比', '背包客',
'简单', '基础', '低成本', '节约', 'budget', 'cheap', 'economy', 'affordable'
],
'comfortable': [
'舒适', '中等', '适中', '一般', '标准', '普通', '正常', '常规',
'中档', '中级', '合理', '平均', '中间档次', 'comfortable', 'standard', 'moderate'
],
'luxury': [
'豪华', '奢华', '高端', '顶级', '精品', '奢侈', '贵族', '皇家',
'贵一点', '不差钱', '任性', '土豪', '有钱', '五星', 'VIP',
'luxury', 'premium', 'high-end', 'expensive', 'fancy'
]
}
for token in tokens:
token_lower = token.lower()
for budget_type, keywords in budget_type_keywords.items():
if any(keyword in token_lower for keyword in keywords):
result["type"] = budget_type
# 找到第一个匹配的关键词作为描述
for keyword in keywords:
if keyword in token_lower:
result["description"] = keyword if len(keyword) > 2 else token
break
break
if result.get("type"):
break
# 3. 如果有金额但没有类型,根据金额推断类型
if result.get("amount") and not result.get("type"):
amount = result["amount"]
currency = result.get("currency", "RMB")
# 根据欧洲旅行成本设置阈值
if currency == "EUR":
if amount < 1500: # 总预算
result["type"] = "economy"
result["description"] = "经济预算"
elif amount < 4000:
result["type"] = "comfortable"
result["description"] = "舒适预算"
else:
result["type"] = "luxury"
result["description"] = "豪华预算"
elif currency == "USD":
if amount < 2000:
result["type"] = "economy"
result["description"] = "经济预算"
elif amount < 5000:
result["type"] = "comfortable"
result["description"] = "舒适预算"
else:
result["type"] = "luxury"
result["description"] = "豪华预算"
elif currency == "RMB":
if amount < 8000:
result["type"] = "economy"
result["description"] = "经济预算"
elif amount < 20000:
result["type"] = "comfortable"
result["description"] = "舒适预算"
else:
result["type"] = "luxury"
result["description"] = "豪华预算"
# 4. 处理中文数字金额
chinese_money_mapping = {
'一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000,
'六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000,
'一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000
}
if not result.get("amount"):
for token in tokens:
if token in chinese_money_mapping:
result["amount"] = chinese_money_mapping[token]
result["currency"] = "RMB"
break
return result
# 保持向后兼容的验证方法
def _validate_and_normalize(self, data: dict) -> dict:
"""验证和规范化数据"""
return data |