Spaces:
Sleeping
Sleeping
| import json | |
| import re | |
| from utils.logger import log | |
| import jieba | |
| from typing import List, Tuple | |
| import copy | |
| class InfoExtractor: | |
| def __init__(self): | |
| self.extraction_schema = { | |
| "destination": {"type": dict, "fields": {"name": str, "country": str}}, | |
| "duration": {"type": dict, "fields": {"days": int, "description": str}}, | |
| "budget": {"type": dict, "fields": {"type": str, "amount": int, "currency": str, "description": str}} | |
| } | |
| # 欧洲城市和国家的完整映射关系(聚焦欧洲) | |
| self.european_cities = { | |
| # === 西欧 === | |
| # 法国 | |
| "巴黎": "法国", "里昂": "法国", "马赛": "法国", "尼斯": "法国", "戛纳": "法国", | |
| "图卢兹": "法国", "南特": "法国", "斯特拉斯堡": "法国", "蒙彼利埃": "法国", "波尔多": "法国", | |
| "里尔": "法国", "雷恩": "法国", "兰斯": "法国", "勒阿弗尔": "法国", "圣埃蒂安": "法国", | |
| "土伦": "法国", "阿维尼翁": "法国", "凡尔赛": "法国", "枫丹白露": "法国", "第戎": "法国", | |
| "昂热": "法国", "贝桑松": "法国", "佩皮尼昂": "法国", "卢尔德": "法国", "沙特尔": "法国", | |
| # 德国 | |
| "柏林": "德国", "慕尼黑": "德国", "汉堡": "德国", "科隆": "德国", "法兰克福": "德国", | |
| "斯图加特": "德国", "杜塞尔多夫": "德国", "多特蒙德": "德国", "埃森": "德国", "莱比锡": "德国", | |
| "不来梅": "德国", "德累斯顿": "德国", "汉诺威": "德国", "纽伦堡": "德国", "杜伊斯堡": "德国", | |
| "波鸿": "德国", "乌珀塔尔": "德国", "比勒费尔德": "德国", "波恩": "德国", "明斯特": "德国", | |
| "卡尔斯鲁厄": "德国", "曼海姆": "德国", "奥格斯堡": "德国", "威斯巴登": "德国", "盖尔森基兴": "德国", | |
| "门兴格拉德巴赫": "德国", "布伦瑞克": "德国", "基尔": "德国", "亚琛": "德国", "哈雷": "德国", | |
| "马格德堡": "德国", "弗莱堡": "德国", "克里菲尔德": "德国", "吕贝克": "德国", "奥伯豪森": "德国", | |
| "埃尔福特": "德国", "罗斯托克": "德国", "凯泽斯劳滕": "德国", "卡塞尔": "德国", "哈根": "德国", | |
| "波茨坦": "德国", "萨尔布吕肯": "德国", "路德维希港": "德国", "奥尔登堡": "德国", "莱沃库森": "德国", | |
| "奥斯纳布吕克": "德国", "索林根": "德国", "海德堡": "德国", "达姆施塔特": "德国", "哈姆": "德国", | |
| "维尔茨堡": "德国", "雷克林豪森": "德国", "沃尔夫斯堡": "德国", "格廷根": "德国", "科特布斯": "德国", | |
| "希尔德斯海姆": "德国", "埃朗根": "德国", "特里尔": "德国", "耶拿": "德国", "康斯坦茨": "德国", | |
| "新天鹅堡": "德国", "罗滕堡": "德国", "科布伦茨": "德国", "班贝格": "德国", "拜罗伊特": "德国", | |
| # 英国 | |
| "伦敦": "英国", "伯明翰": "英国", "曼彻斯特": "英国", "格拉斯哥": "英国", "利物浦": "英国", | |
| "利兹": "英国", "谢菲尔德": "英国", "爱丁堡": "英国", "布里斯托": "英国", "莱斯特": "英国", | |
| "考文垂": "英国", "布拉德福德": "英国", "贝尔法斯特": "英国", "卡迪夫": "英国", "诺丁汉": "英国", | |
| "金斯顿": "英国", "纽卡斯尔": "英国", "普利茅斯": "英国", "斯托克": "英国", "南安普顿": "英国", | |
| "雷丁": "英国", "德比": "英国", "约克": "英国", "牛津": "英国", "剑桥": "英国", | |
| "巴斯": "英国", "温莎": "英国", "坎特伯雷": "英国", "斯特拉特福": "英国", "湖区": "英国", | |
| "斯凯岛": "英国", "爱丁堡": "英国", "格拉斯哥": "英国", "史德灵": "英国", "珀斯": "英国", | |
| "因弗内斯": "英国", "阿伯丁": "英国", "邓迪": "英国", "法夫": "英国", "奥班": "英国", | |
| # 荷兰 | |
| "阿姆斯特丹": "荷兰", "鹿特丹": "荷兰", "海牙": "荷兰", "乌得勒支": "荷兰", "埃因霍温": "荷兰", | |
| "蒂尔堡": "荷兰", "格罗宁根": "荷兰", "阿尔梅勒": "荷兰", "布雷达": "荷兰", "奈梅亨": "荷兰", | |
| "阿珀尔多伦": "荷兰", "哈勒姆": "荷兰", "阿纳姆": "荷兰", "恩斯赫德": "荷兰", "阿默斯福特": "荷兰", | |
| "赞丹": "荷兰", "海牙": "荷兰", "阿尔克马尔": "荷兰", "马斯特里赫特": "荷兰", "莱顿": "荷兰", | |
| "代尔夫特": "荷兰", "多德雷赫特": "荷兰", "豪达": "荷兰", "羊角村": "荷兰", "马尔肯": "荷兰", | |
| # 比利时 | |
| "布鲁塞尔": "比利时", "安特卫普": "比利时", "根特": "比利时", "沙勒罗瓦": "比利时", "列日": "比利时", | |
| "布吕赫": "比利时", "那慕尔": "比利时", "蒙斯": "比利时", "阿尔斯特": "比利时", "科特赖克": "比利时", | |
| "哈瑟尔特": "比利时", "圣尼古拉": "比利时", "奥斯坦德": "比利时", "梅赫伦": "比利时", "鲁汶": "比利时", | |
| # 卢森堡 | |
| "卢森堡市": "卢森堡", "埃施": "卢森堡", "迪费当日": "卢森堡", "杜德朗日": "卢森堡", | |
| # === 南欧 === | |
| # 意大利 | |
| "罗马": "意大利", "米兰": "意大利", "威尼斯": "意大利", "佛罗伦萨": "意大利", "那不勒斯": "意大利", | |
| "都灵": "意大利", "帕勒莫": "意大利", "热那亚": "意大利", "博洛尼亚": "意大利", "巴里": "意大利", | |
| "卡塔尼亚": "意大利", "佛罗伦萨": "意大利", "韦罗纳": "意大利", "威尼斯": "意大利", "墨西拿": "意大利", | |
| "帕多瓦": "意大利", "的里雅斯特": "意大利", "塔兰托": "意大利", "布雷西亚": "意大利", "摩德纳": "意大利", | |
| "雷焦卡拉布里亚": "意大利", "普拉托": "意大利", "卡利亚里": "意大利", "帕尔马": "意大利", "佩鲁贾": "意大利", | |
| "利沃诺": "意大利", "雷焦艾米利亚": "意大利", "佛嘉": "意大利", "萨莱诺": "意大利", "拉温纳": "意大利", | |
| "里米尼": "意大利", "拉斯佩齐亚": "意大利", "萨萨里": "意大利", "蒙扎": "意大利", "贝加莫": "意大利", | |
| "比萨": "意大利", "维琴察": "意大利", "三月十五日": "意大利", "博尔扎诺": "意大利", "安德里亚": "意大利", | |
| "阿雷佐": "意大利", "蒂沃利": "意大利", "阿西西": "意大利", "锡耶纳": "意大利", "五渔村": "意大利", | |
| "马泰拉": "意大利", "庞贝": "意大利", "卡普里岛": "意大利", "阿马尔菲": "意大利", "科莫": "意大利", | |
| # 西班牙 | |
| "马德里": "西班牙", "巴塞罗那": "西班牙", "瓦伦西亚": "西班牙", "塞维利亚": "西班牙", "萨拉戈萨": "西班牙", | |
| "马拉加": "西班牙", "穆尔西亚": "西班牙", "帕尔马": "西班牙", "拉斯帕尔马斯": "西班牙", "毕尔巴鄂": "西班牙", | |
| "阿利坎特": "西班牙", "科尔多瓦": "西班牙", "巴利亚多利德": "西班牙", "维戈": "西班牙", "希洪": "西班牙", | |
| "莱昂": "西班牙", "拉科鲁尼亚": "西班牙", "埃尔切": "西班牙", "奥维耶多": "西班牙", "圣塞巴斯蒂安": "西班牙", | |
| "桑坦德": "西班牙", "卡斯特利翁": "西班牙", "洛格罗尼奥": "西班牙", "巴达霍斯": "西班牙", "萨拉曼卡": "西班牙", | |
| "韦尔瓦": "西班牙", "阿尔梅里亚": "西班牙", "卡迪斯": "西班牙", "格拉纳达": "西班牙", "托莱多": "西班牙", | |
| "昆卡": "西班牙", "卡塞雷斯": "西班牙", "塞哥维亚": "西班牙", "阿维拉": "西班牙", "布尔戈斯": "西班牙", | |
| "马略卡岛": "西班牙", "伊比萨": "西班牙", "特内里费": "西班牙", "大加那利": "西班牙", "兰萨罗特": "西班牙", | |
| # 葡萄牙 | |
| "里斯本": "葡萄牙", "波尔图": "葡萄牙", "阿马多拉": "葡萄牙", "布拉加": "葡萄牙", "塞图巴尔": "葡萄牙", | |
| "科英布拉": "葡萄牙", "丰沙尔": "葡萄牙", "阿威罗": "葡萄牙", "埃武拉": "葡萄牙", "法鲁": "葡萄牙", | |
| "阿尔布费拉": "葡萄牙", "辛特拉": "葡萄牙", "卡斯凯什": "葡萄牙", "奥比杜什": "葡萄牙", "波尔塔莱格雷": "葡萄牙", | |
| "吉马良斯": "葡萄牙", "维亚纳堡": "葡萄牙", "维塞乌": "葡萄牙", "拉戈什": "葡萄牙", "萨格里什": "葡萄牙", | |
| # 希腊 | |
| "雅典": "希腊", "塞萨洛尼基": "希腊", "帕特雷": "希腊", "伊拉克利翁": "希腊", "拉里萨": "希腊", | |
| "沃洛斯": "希腊", "约阿尼纳": "希腊", "卡瓦拉": "希腊", "哈尼亚": "希腊", "塞雷斯": "希腊", | |
| "圣托里尼": "希腊", "米科诺斯": "希腊", "罗德岛": "希腊", "科孚": "希腊", "克里特": "希腊", | |
| "帕罗斯": "希腊", "纳克索斯": "希腊", "扎金索斯": "希腊", "凯法利尼亚": "希腊", "斯基亚索斯": "希腊", | |
| "德尔菲": "希腊", "奥林匹亚": "希腊", "迈锡尼": "希腊", "埃皮达鲁斯": "希腊", "梅泰奥拉": "希腊", | |
| # === 中欧 === | |
| # 奥地利 | |
| "维也纳": "奥地利", "格拉茨": "奥地利", "林茨": "奥地利", "萨尔茨堡": "奥地利", "因斯布鲁克": "奥地利", | |
| "克拉根福": "奥地利", "菲拉赫": "奥地利", "韦尔斯": "奥地利", "圣珀尔滕": "奥地利", "多恩比恩": "奥地利", | |
| "维也纳新城": "奥地利", "施泰尔": "奥地利", "费尔德基兴": "奥地利", "布鲁克": "奥地利", "莱奥本": "奥地利", | |
| "哈尔施塔特": "奥地利", "巴德伊舍尔": "奥地利", "梅尔克": "奥地利", "瓦绍": "奥地利", "库夫斯坦": "奥地利", | |
| # 捷克 | |
| "布拉格": "捷克", "布尔诺": "捷克", "俄斯特拉发": "捷克", "比尔森": "捷克", "奥洛穆茨": "捷克", | |
| "利贝雷茨": "捷克", "赫拉德茨克拉洛韦": "捷克", "乌斯季": "捷克", "帕尔杜比采": "捷克", "兹林": "捷克", | |
| "哈维若夫": "捷克", "克拉德诺": "捷克", "切斯凯布杰约维采": "捷克", "莫斯特": "捷克", "卡尔维纳": "捷克", | |
| "库特纳霍拉": "捷克", "泰尔奇": "捷克", "克鲁姆洛夫": "捷克", "卡尔什特因": "捷克", "布拉格城堡": "捷克", | |
| # 匈牙利 | |
| "布达佩斯": "匈牙利", "德布勒森": "匈牙利", "塞格德": "匈牙利", "米什科尔茨": "匈牙利", "佩奇": "匈牙利", | |
| "焦尔": "匈牙利", "尼赖吉哈佐": "匈牙利", "凯奇凯梅特": "匈牙利", "塞克什白堡": "匈牙利", "松博特海伊": "匈牙利", | |
| "松博特海伊": "匈牙利", "维斯普雷姆": "匈牙利", "埃格尔": "匈牙利", "贝凯什乔包": "匈牙利", "大沃拉丁": "匈牙利", | |
| "埃斯泰尔戈姆": "匈牙利", "维谢格拉德": "匈牙利", "霍洛克": "匈牙利", "蒂豪尼": "匈牙利", "巴拉顿湖": "匈牙利", | |
| # 波兰 | |
| "华沙": "波兰", "克拉科夫": "波兰", "罗兹": "波兰", "弗罗茨瓦夫": "波兰", "波兹南": "波兰", | |
| "格但斯克": "波兰", "什切青": "波兰", "比得哥什": "波兰", "卢布林": "波兰", "卡托维兹": "波兰", | |
| "白雅斯托克": "波兰", "格丁尼亚": "波兰", "琴斯托霍瓦": "波兰", "拉多姆": "波兰", "索斯诺维茨": "波兰", | |
| "托伦": "波兰", "基尔采": "波兰", "格利维采": "波兰", "扎布热": "波兰", "比托姆": "波兰", | |
| "奥斯威辛": "波兰", "马尔堡": "波兰", "扎科帕内": "波兰", "维利奇卡": "波兰", "弗罗茨瓦夫": "波兰", | |
| # 斯洛伐克 | |
| "布拉迪斯拉发": "斯洛伐克", "科希策": "斯洛伐克", "普雷绍夫": "斯洛伐克", "日利纳": "斯洛伐克", "班斯卡比斯特里察": "斯洛伐克", | |
| "尼特拉": "斯洛伐克", "特伦钦": "斯洛伐克", "马丁": "斯洛伐克", "特尔纳瓦": "斯洛伐克", "波普拉德": "斯洛伐克", | |
| "普里维德扎": "斯洛伐克", "兹沃伦": "斯洛伐克", "巴尔代约夫": "斯洛伐克", "列沃恰": "斯洛伐克", "斯皮什斯基堡": "斯洛伐克", | |
| # 斯洛文尼亚 | |
| "卢布尔雅那": "斯洛文尼亚", "马里博尔": "斯洛文尼亚", "采列": "斯洛文尼亚", "克拉尼": "斯洛文尼亚", "韦莱涅": "斯洛文尼亚", | |
| "新戈里察": "斯洛文尼亚", "科佩尔": "斯洛文尼亚", "诺沃梅斯托": "斯洛文尼亚", "卡姆尼克": "斯洛文尼亚", "多姆扎勒": "斯洛文尼亚", | |
| "布莱德": "斯洛文尼亚", "博希尼": "斯洛文尼亚", "皮兰": "斯洛文尼亚", "什科茨扬": "斯洛文尼亚", "波斯托伊纳": "斯洛文尼亚", | |
| # 瑞士 | |
| "苏黎世": "瑞士", "日内瓦": "瑞士", "巴塞尔": "瑞士", "伯尔尼": "瑞士", "洛桑": "瑞士", | |
| "圣加仑": "瑞士", "卢塞恩": "瑞士", "卢加诺": "瑞士", "比尔": "瑞士", "图恩": "瑞士", | |
| "拉绍德封": "瑞士", "沙夫豪森": "瑞士", "弗里堡": "瑞士", "韦维": "瑞士", "拉佩斯": "瑞士", | |
| "因特拉肯": "瑞士", "采尔马特": "瑞士", "格林德瓦": "瑞士", "少女峰": "瑞士", "马特洪峰": "瑞士", | |
| "圣莫里茨": "瑞士", "洛伊克巴德": "瑞士", "安德马特": "瑞士", "文根": "瑞士", "拉克斯": "瑞士", | |
| # === 北欧 === | |
| # 瑞典 | |
| "斯德哥尔摩": "瑞典", "哥德堡": "瑞典", "马尔默": "瑞典", "乌普萨拉": "瑞典", "林雪平": "瑞典", | |
| "韦斯特罗斯": "瑞典", "厄勒布鲁": "瑞典", "北雪平": "瑞典", "赫尔辛堡": "瑞典", "永雪平": "瑞典", | |
| "松兹瓦尔": "瑞典", "于默奥": "瑞典", "韦克舍": "瑞典", "加夫勒": "瑞典", "博罗斯": "瑞典", | |
| "法伦": "瑞典", "卡尔斯塔德": "瑞典", "卡尔马": "瑞典", "维斯比": "瑞典", "基律纳": "瑞典", | |
| # 挪威 | |
| "奥斯陆": "挪威", "卑尔根": "挪威", "特隆赫姆": "挪威", "斯塔万格": "斯洛文尼亚", "克里斯蒂安桑": "挪威", | |
| "腓特烈斯塔": "挪威", "德拉门": "挪威", "谢恩": "挪威", "桑内斯": "挪威", "萨尔普斯堡": "挪威", | |
| "特洛姆瑟": "挪威", "博多": "挪威", "阿尔塔": "挪威", "哈默菲斯特": "挪威", "纳尔维克": "挪威", | |
| "弗洛姆": "挪威", "盖朗厄尔": "挪威", "奥勒松": "挪威", "利勒哈默尔": "挪威", "罗弗敦群岛": "挪威", | |
| # 丹麦 | |
| "哥本哈根": "丹麦", "奥胡斯": "丹麦", "欧登塞": "丹麦", "奥尔堡": "丹麦", "埃斯比约": "丹麦", | |
| "兰德斯": "丹麦", "科尔丁": "丹麦", "赫尔辛格": "丹麦", "马里布": "丹麦", "海勒鲁普": "丹麦", | |
| "比隆": "丹麦", "希勒勒": "丹麦", "罗斯基勒": "丹麦", "斯卡恩": "丹麦", "法尔瑟特": "丹麦", | |
| # 芬兰 | |
| "赫尔辛基": "芬兰", "埃斯波": "芬兰", "坦佩雷": "芬兰", "万塔": "芬兰", "图尔库": "芬兰", | |
| "奥卢": "芬兰", "拉赫蒂": "芬兰", "库奥皮奥": "芬兰", "约恩苏": "芬兰", "约瓦斯屈莱": "芬兰", | |
| "拉彭兰塔": "芬兰", "科特卡": "芬兰", "瓦萨": "芬兰", "弗绍": "芬兰", "海门林纳": "芬兰", | |
| "罗瓦涅米": "芬兰", "凯米": "芬兰", "托尔尼奥": "芬兰", "萨利色尔卡": "芬兰", "伊瓦洛": "芬兰", | |
| # 冰岛 | |
| "雷克雅未克": "冰岛", "科帕沃古尔": "冰岛", "哈夫纳夫约杜尔": "冰岛", "阿克雷里": "冰岛", "雷克雅内斯": "冰岛", | |
| "塞尔福斯": "冰岛", "韦斯特曼纳群岛": "冰岛", "胡萨维克": "冰岛", "埃伊尔斯塔济": "冰岛", "凯夫拉维克": "冰岛", | |
| # === 东欧 === | |
| # 俄罗斯(欧洲部分) | |
| "莫斯科": "俄罗斯", "圣彼得堡": "俄罗斯", "下诺夫哥罗德": "俄罗斯", "喀山": "俄罗斯", "萨马拉": "俄罗斯", | |
| "伏尔加格勒": "俄罗斯", "罗斯托夫": "俄罗斯", "乌法": "俄罗斯", "彭萨": "俄罗斯", "雅罗斯拉夫": "俄罗斯", | |
| "卡卢加": "俄罗斯", "图拉": "俄罗斯", "弗拉基米尔": "俄罗斯", "苏兹达尔": "俄罗斯", "谢尔盖夫": "俄罗斯", | |
| # 乌克兰 | |
| "基辅": "乌克兰", "哈尔科夫": "乌克兰", "敖德萨": "乌克兰", "第聂伯": "乌克兰", "顿涅茨克": "乌克兰", | |
| "扎波罗热": "乌克兰", "利沃夫": "乌克兰", "克里沃罗格": "乌克兰", "尼古拉耶夫": "乌克兰", "马里乌波尔": "乌克兰", | |
| "卢甘斯克": "乌克兰", "文尼察": "乌克兰", "赫尔松": "乌克兰", "切尔卡瑟": "乌克兰", "切尔尼戈夫": "乌克兰", | |
| # 白俄罗斯 | |
| "明斯克": "白俄罗斯", "戈梅利": "白俄罗斯", "莫吉廖夫": "白俄罗斯", "维帖布斯克": "白俄罗斯", "格罗德诺": "白俄罗斯", | |
| "布列斯特": "白俄罗斯", "鲍里索夫": "白俄罗斯", "巴拉诺维奇": "白俄罗斯", "平斯克": "白俄罗斯", "奥尔沙": "白俄罗斯", | |
| # 波罗的海三国 | |
| "里加": "拉脱维亚", "陶格夫匹尔斯": "拉脱维亚", "利耶帕亚": "拉脱维亚", "叶尔加瓦": "拉脱维亚", "文茨皮尔斯": "拉脱维亚", | |
| "塔林": "爱沙尼亚", "塔尔图": "爱沙尼亚", "纳尔瓦": "爱沙尼亚", "帕尔努": "爱沙尼亚", "科赫特拉": "爱沙尼亚", | |
| "维尔纽斯": "立陶宛", "考纳斯": "立陶宛", "克莱佩达": "立陶宛", "希奥利艾": "立陶宛", "帕内韦日斯": "立陶宛", | |
| # 摩尔多瓦 | |
| "基希讷乌": "摩尔多瓦", "蒂拉斯波尔": "摩尔多瓦", "巴尔济": "摩尔多瓦", "本德尔": "摩尔多瓦", "雷布尼察": "摩尔多瓦", | |
| # === 巴尔干半岛 === | |
| # 克罗地亚 | |
| "萨格勒布": "克罗地亚", "斯普利特": "克罗地亚", "里耶卡": "克罗地亚", "奥西耶克": "克罗地亚", "扎达尔": "克罗地亚", | |
| "普拉": "克罗地亚", "杜布罗夫尼克": "克罗地亚", "希贝尼克": "克罗地亚", "卡尔洛瓦茨": "克罗地亚", "瓦拉日丁": "克罗地亚", | |
| "罗维尼": "克罗地亚", "波雷奇": "克罗地亚", "特罗吉尔": "克罗地亚", "赫瓦尔": "克罗地亚", "科尔丘拉": "克罗地亚", | |
| # 塞尔维亚 | |
| "贝尔格莱德": "塞尔维亚", "诺维萨德": "塞尔维亚", "尼什": "塞尔维亚", "克拉古耶瓦茨": "塞尔维亚", "苏博蒂察": "塞尔维亚", | |
| "潘切沃": "塞尔维亚", "泽蒙": "塞尔维亚", "莱斯科瓦茨": "塞尔维亚", "恰恰克": "塞尔维亚", "新帕扎尔": "塞尔维亚", | |
| # 波黑 | |
| "萨拉热窝": "波黑", "巴尼亚卢卡": "波黑", "图兹拉": "波黑", "泽尼察": "波黑", "莫斯塔尔": "波黑", | |
| "比哈奇": "波黑", "布里耶利纳": "波黑", "多博伊": "波黑", "格拉迪什卡": "波黑", "利夫诺": "波黑", | |
| # 黑山 | |
| "波德戈里察": "黑山", "尼克希奇": "黑山", "普里耶波列": "黑山", "比耶洛波列": "黑山", "采蒂涅": "黑山", | |
| "布德瓦": "黑山", "科托尔": "黑山", "乌尔齐尼": "黑山", "赫尔采格诺维": "黑山", "巴尔": "黑山", | |
| # 北马其顿 | |
| "斯科普里": "北马其顿", "库马诺沃": "北马其顿", "比托拉": "北马其顿", "普里莱普": "北马其顿", "特托沃": "北马其顿", | |
| "韦莱斯": "北马其顿", "什蒂普": "北马其顿", "奥赫里德": "北马其顿", "戈斯蒂瓦尔": "北马其顿", "斯特鲁加": "北马其顿", | |
| # 阿尔巴尼亚 | |
| "地拉那": "阿尔巴尼亚", "都拉斯": "阿尔巴尼亚", "埃尔巴桑": "阿尔巴尼亚", "发罗拉": "阿尔巴尼亚", "斯库台": "阿尔巴尼亚", | |
| "科尔察": "阿尔巴尼亚", "卢什涅": "阿尔巴尼亚", "费里": "阿尔巴尼亚", "贝拉特": "阿尔巴尼亚", "吉诺卡斯特": "阿尔巴尼亚", | |
| # 保加利亚 | |
| "索菲亚": "保加利亚", "普罗夫迪夫": "保加利亚", "瓦尔纳": "保加利亚", "布尔加斯": "保加利亚", "鲁塞": "保加利亚", | |
| "斯塔拉扎戈拉": "保加利亚", "普列文": "保加利亚", "슬리문": "保加利亚", "多布里奇": "保加利亚", "舒门": "保加利亚", | |
| "帕扎尔吉克": "保加利亚", "哈斯科沃": "保加利亚", "扬博尔": "保加利亚", "布拉戈耶夫格勒": "保加利亚", "韦利科特尔诺沃": "保加利亚", | |
| # 罗马尼亚 | |
| "布加勒斯特": "罗马尼亚", "克卢日": "罗马尼亚", "蒂米什瓦拉": "罗马尼亚", "雅西": "罗马尼亚", "康斯坦察": "罗马尼亚", | |
| "克拉约瓦": "罗马尼亚", "布拉索夫": "罗马尼亚", "加拉茨": "罗马尼亚", "普洛耶什蒂": "罗马尼亚", "奥拉迪亚": "罗马尼亚", | |
| "布勒伊拉": "罗马尼亚", "阿拉德": "罗马尼亚", "皮特什蒂": "罗马尼亚", "锡比乌": "罗马尼亚", "巴克乌": "罗马尼亚", | |
| "锡纳亚": "罗马尼亚", "布兰": "罗马尼亚", "德古拉城堡": "罗马尼亚", "佩莱什城堡": "罗马尼亚", "马拉穆雷什": "罗马尼亚", | |
| # 土耳其(欧洲部分) | |
| "伊斯坦布尔": "土耳其", "埃迪尔内": "土耳其", "泰基尔达": "土耳其", "克尔克拉雷利": "土耳其", "恰纳卡莱": "土耳其", | |
| # 塞浦路斯 | |
| "尼科西亚": "塞浦路斯", "利马索尔": "塞浦路斯", "拉纳卡": "塞浦路斯", "法马古斯塔": "塞浦路斯", "帕福斯": "塞浦路斯", | |
| "凯里尼亚": "塞浦路斯", "阿依纳帕": "塞浦路斯", "普罗塔拉斯": "塞浦路斯", "特罗多斯": "塞浦路斯", "阿卡马斯": "塞浦路斯", | |
| # 马耳他 | |
| "瓦莱塔": "马耳他", "斯利马": "马耳他", "圣朱利安斯": "马耳他", "姆西达": "马耳他", "维多利亚": "马耳他", | |
| "马尔萨什洛克": "马耳他", "梅利哈": "马耳他", "戈佐": "马耳他", "蓝湖": "马耳他", "姆迪纳": "马耳他", | |
| } | |
| # 欧洲城市别名映射(包含各种表达方式) | |
| self.european_city_aliases = { | |
| # 英文名称映射 | |
| "paris": "巴黎", "rome": "罗马", "london": "伦敦", "berlin": "柏林", | |
| "madrid": "马德里", "barcelona": "巴塞罗那", "vienna": "维也纳", "prague": "布拉格", | |
| "amsterdam": "阿姆斯特丹", "florence": "佛罗伦萨", "venice": "威尼斯", "athens": "雅典", | |
| "budapest": "布达佩斯", "lisbon": "里斯本", "stockholm": "斯德哥尔摩", "copenhagen": "哥本哈根", | |
| "helsinki": "赫尔辛基", "oslo": "奥斯陆", "zurich": "苏黎世", "geneva": "日内瓦", | |
| "munich": "慕尼黑", "milan": "米兰", "naples": "那不勒斯", "nice": "尼斯", | |
| "edinburgh": "爱丁堡", "dublin": "都柏林", "brussels": "布鲁塞尔", "warsaw": "华沙", | |
| "krakow": "克拉科夫", "zagreb": "萨格勒布", "belgrade": "贝尔格莱德", "sofia": "索菲亚", | |
| "bucharest": "布加勒斯特", "kiev": "基辅", "moscow": "莫斯科", "st petersburg": "圣彼得堡", | |
| "reykjavik": "雷克雅未克", "tallinn": "塔林", "riga": "里加", "vilnius": "维尔纽斯", | |
| "bratislava": "布拉迪斯拉发", "ljubljana": "卢布尔雅那", "sarajevo": "萨拉热窝", | |
| "dubrovnik": "杜布罗夫尼克", "split": "斯普利特", "santorini": "圣托里尼", "mykonos": "米科诺斯", | |
| # 中文别名 | |
| "花都": "巴黎", "光之城": "巴黎", "永恒之城": "罗马", "雾都": "伦敦", | |
| "音乐之都": "维也纳", "黄金城市": "布拉格", "千塔之城": "布拉格", | |
| "运河之城": "阿姆斯特丹", "翡冷翠": "佛罗伦萨", "文艺复兴之都": "佛罗伦萨", | |
| "水城": "威尼斯", "西方文明的摇篮": "雅典", "多瑙河明珠": "布达佩斯", | |
| "七丘之城": "里斯本", "北方威尼斯": "斯德哥尔摩", "童话之都": "哥本哈根", | |
| "波罗的海的女儿": "赫尔辛基", "欧洲屋脊": "因特拉肯", "北方雅典": "爱丁堡", | |
| "翡翠岛": "都柏林", "欧洲之都": "布鲁塞尔", "高迪之城": "巴塞罗那", | |
| } | |
| self.chinese_numbers = { | |
| '一': 1, '二': 2, '三': 3, '四': 4, '五': 5, '六': 6, '七': 7, '八': 8, '九': 9, '十': 10, | |
| '两': 2, '半': 0.5, '壹': 1, '贰': 2, '叁': 3, '肆': 4, '伍': 5, '陆': 6, '柒': 7, '捌': 8, '玖': 9, '拾': 10, | |
| # 英文数字 | |
| 'one': 1, 'two': 2, 'three': 3, 'four': 4, 'five': 5, 'six': 6, 'seven': 7, 'eight': 8, 'nine': 9, 'ten': 10, | |
| 'eleven': 11, 'twelve': 12, 'thirteen': 13, 'fourteen': 14, 'fifteen': 15, | |
| # 特殊时长表达 | |
| '半个月': 15, '一个月': 30, '半年': 180, '一年': 365, | |
| '半天': 0.5, '一天': 1, '两天': 2, '三天': 3, '四天': 4, '五天': 5, '六天': 6, '七天': 7, | |
| '八天': 8, '九天': 9, '十天': 10, '半周': 3.5, '一周': 7, '两周': 14, | |
| # 假期相关 | |
| '小长假': 3, '长假': 7, '十一': 7, '国庆': 7, '春节': 7, '五一': 3, '清明': 3, | |
| '端午': 3, '中秋': 3, '元旦': 3, '暑假': 60, '寒假': 30, '周末': 2, '长周末': 3, | |
| # 英文假期 | |
| 'weekend': 2, 'week': 7, 'month': 30, 'vacation': 7, 'holiday': 3 | |
| } | |
def extract(self, user_message: str, existing_info: dict = None) -> dict:
    """Extract destination, duration and budget from one user message.

    Args:
        user_message: raw text typed by the user.
        existing_info: slots collected from earlier turns, if any. It is
            never mutated; a deep copy is updated instead.

    Returns:
        The accumulated slots: ``existing_info`` (copied) updated with
        whatever this message contributed. For an invalid or too-short
        message the existing info (or ``{}``) is returned unchanged.
    """
    # Input validation
    if not user_message or not isinstance(user_message, str):
        log.warning("⚠️ 收到无效的用户消息")
        return existing_info or {}
    if len(user_message.strip()) < 2:
        log.warning("⚠️ 用户消息过短,跳过信息提取")
        return existing_info or {}
    if existing_info:
        log.info(f"接收到上下文信息,将在此基础上更新: {existing_info}")
        result = copy.deepcopy(existing_info)
    else:
        result = {}
    log.info(f"🛠️ 使用分词策略提取信息:'{user_message[:50]}...'")
    # 1. Tokenize
    tokens = self._tokenize_message(user_message)
    log.info(f"📝 分词结果:{tokens}")
    # 2. Run each slot extractor over the tokens
    newly_extracted_info = {}
    extractors = (
        ("destination", self._extract_destination_from_tokens),
        ("duration", self._extract_duration_from_tokens),
        ("budget", self._extract_budget_from_tokens),
    )
    for slot, extractor in extractors:
        slot_info = extractor(tokens)
        if slot_info:
            newly_extracted_info[slot] = slot_info
    log.info(f"📊 分词提取结果: {newly_extracted_info}")
    # 3. Fold the new slots into the copied context. Previously the copy was
    # built and then discarded, so callers lost earlier turns' information.
    # A new destination replaces the old one wholesale: merging field by
    # field could pair the new city name with the previous city's country.
    if "destination" in newly_extracted_info:
        result.pop("destination", None)
    return self._merge_info(newly_extracted_info, result)
| def _merge_info(self, new_info: dict, existing_info: dict) -> dict: | |
| for key, value in new_info.items(): | |
| # 如果新旧信息中同一个键的值都是字典,则递归深入合并 | |
| if isinstance(value, dict) and key in existing_info and isinstance(existing_info[key], dict): | |
| self._merge_info(value, existing_info[key]) | |
| else: | |
| # 否则,直接用新信息覆盖或添加 | |
| existing_info[key] = value | |
| return existing_info | |
| def _tokenize_message(self, text: str) -> list: | |
| """智能分词,支持中英文混合""" | |
| # 预处理:统一标点符号和空格 | |
| text = text.replace(',', ',').replace('。', '.').replace('!', '!').replace('?', '?') | |
| text = text.replace('(', '(').replace(')', ')').replace('【', '[').replace('】', ']') | |
| tokens = [] | |
| current_token = "" | |
| i = 0 | |
| while i < len(text): | |
| char = text[i] | |
| # 处理空格和标点符号 | |
| if char in ' ,,.。!!??()()[]【】::;;': | |
| if current_token: | |
| tokens.append(current_token) | |
| current_token = "" | |
| if char.strip(): # 保留非空格的标点符号 | |
| tokens.append(char) | |
| i += 1 | |
| continue | |
| # 处理数字(包括小数和货币符号) | |
| if char.isdigit() or char in '¥$€£₩': | |
| if current_token and not (current_token[-1].isdigit() or current_token[-1] in '¥$€£₩.'): | |
| tokens.append(current_token) | |
| current_token = char | |
| else: | |
| current_token += char | |
| # 继续读取数字部分 | |
| i += 1 | |
| while i < len(text) and (text[i].isdigit() or text[i] in '.,'): | |
| current_token += text[i] | |
| i += 1 | |
| # 检查货币单位 | |
| currency_units = ['元', '块', '钱', '欧', '美元', '英镑', '日元', '韩元', '瑞郎', 'rmb', 'usd', 'eur', 'gbp', 'jpy', 'krw', 'chf'] | |
| remaining_text = text[i:].lower() | |
| for unit in currency_units: | |
| if remaining_text.startswith(unit): | |
| current_token += text[i:i+len(unit)] | |
| i += len(unit) | |
| break | |
| tokens.append(current_token) | |
| current_token = "" | |
| continue | |
| # 处理英文单词 | |
| if char.isalpha() and ord(char) < 128: # ASCII字符 | |
| if current_token and not current_token[-1].isalpha(): | |
| tokens.append(current_token) | |
| current_token = char | |
| else: | |
| current_token += char | |
| # 继续读取英文字符 | |
| i += 1 | |
| while i < len(text) and text[i].isalpha() and ord(text[i]) < 128: | |
| current_token += text[i] | |
| i += 1 | |
| tokens.append(current_token) | |
| current_token = "" | |
| continue | |
| # 处理中文字符 | |
| if self._is_chinese_char(char): | |
| if current_token and not self._is_chinese_char(current_token[-1]): | |
| tokens.append(current_token) | |
| current_token = "" | |
| # 对于中文,我们需要智能分词 | |
| # 检查是否是多字符城市名、时间表达等 | |
| remaining_text = text[i:] | |
| # 尝试匹配城市名 | |
| matched_city = self._match_city_name(remaining_text) | |
| if matched_city: | |
| tokens.append(matched_city) | |
| i += len(matched_city) | |
| continue | |
| # 尝试匹配时间表达 | |
| matched_time = self._match_time_expression(remaining_text) | |
| if matched_time: | |
| tokens.append(matched_time) | |
| i += len(matched_time) | |
| continue | |
| # 尝试匹配预算类型关键词 | |
| matched_budget_type = self._match_budget_type(remaining_text) | |
| if matched_budget_type: | |
| tokens.append(matched_budget_type) | |
| i += len(matched_budget_type) | |
| continue | |
| # 尝试匹配常见词汇 | |
| matched_word = self._match_common_word(remaining_text) | |
| if matched_word: | |
| tokens.append(matched_word) | |
| i += len(matched_word) | |
| continue | |
| # 单个中文字符 | |
| tokens.append(char) | |
| i += 1 | |
| else: | |
| # 其他字符 | |
| current_token += char | |
| i += 1 | |
| # 处理最后的token | |
| if current_token: | |
| tokens.append(current_token) | |
| # 后处理:合并一些相关的tokens | |
| tokens = self._post_process_tokens(tokens) | |
| return [token for token in tokens if token.strip()] # 过滤空token | |
| def _is_chinese_char(self, char: str) -> bool: | |
| """判断是否为中文字符""" | |
| return '\u4e00' <= char <= '\u9fff' | |
| def _match_city_name(self, text: str) -> str: | |
| """匹配城市名称""" | |
| # 按长度从长到短排序,优先匹配长的城市名 | |
| all_cities = list(self.european_cities.keys()) + list(self.european_city_aliases.keys()) | |
| all_cities = sorted(set(all_cities), key=len, reverse=True) | |
| for city in all_cities: | |
| if text.startswith(city): | |
| return city | |
| return "" | |
| def _match_time_expression(self, text: str) -> str: | |
| """匹配时间表达""" | |
| time_expressions = [ | |
| # 多字符时间表达 | |
| '半个月', '一个月', '两个月', '三个月', '半年', '一年', | |
| '小长假', '长周末', '国庆节', '春节假期', '暑假', '寒假', | |
| '一天半', '两天半', '三天半', '一周半', '两周', | |
| # 英文时间表达 | |
| 'one day', 'two days', 'three days', 'one week', 'two weeks', | |
| 'long weekend', 'vacation', 'holiday', 'spring break' | |
| ] | |
| # 按长度排序,优先匹配长表达 | |
| time_expressions = sorted(time_expressions, key=len, reverse=True) | |
| text_lower = text.lower() | |
| for expr in time_expressions: | |
| if text_lower.startswith(expr.lower()): | |
| return expr | |
| if text.startswith(expr): | |
| return expr | |
| return "" | |
| def _match_budget_type(self, text: str) -> str: | |
| """匹配预算类型关键词""" | |
| budget_keywords = [ | |
| # 经济型 | |
| '经济实惠', '省钱', '便宜', '实惠', '经济', '穷游', '背包客', | |
| '青年旅社', '学生', '预算有限', '性价比', | |
| # 舒适型 | |
| '舒适', '中等', '适中', '标准', '普通', '中档', '合理', | |
| # 豪华型 | |
| '豪华', '奢华', '高端', '顶级', '精品', '五星', '不差钱', | |
| '任性', '土豪', 'VIP', '贵族', '皇家' | |
| ] | |
| # 按长度排序 | |
| budget_keywords = sorted(budget_keywords, key=len, reverse=True) | |
| for keyword in budget_keywords: | |
| if text.startswith(keyword): | |
| return keyword | |
| return "" | |
| def _match_common_word(self, text: str) -> str: | |
| """匹配常见词汇""" | |
| common_words = [ | |
| # 旅行相关动词 | |
| '想去', '计划去', '打算去', '准备去', '希望去', '考虑去', | |
| '前往', '旅行', '旅游', '游玩', '度假', '出发', '飞往', | |
| # 时间相关 | |
| '三天', '四天', '五天', '六天', '七天', '八天', '九天', '十天', | |
| '一天', '两天', '几天', '多天', '数天', | |
| # 预算相关 | |
| '预算', '花费', '费用', '成本', '开销', '支出', '消费', | |
| '总共', '一共', '大概', '约', '左右', '差不多', | |
| # 其他 | |
| '行程', '计划', '安排', '路线', '攻略' | |
| ] | |
| # 按长度排序 | |
| common_words = sorted(common_words, key=len, reverse=True) | |
| for word in common_words: | |
| if text.startswith(word): | |
| return word | |
| return "" | |
| def _post_process_tokens(self, tokens: list) -> list: | |
| """后处理tokens,合并相关的片段""" | |
| if not tokens: | |
| return tokens | |
| processed = [] | |
| i = 0 | |
| while i < len(tokens): | |
| current_token = tokens[i] | |
| # 合并数字+单位的组合 | |
| if i < len(tokens) - 1: | |
| next_token = tokens[i + 1] | |
| # 数字 + 货币单位 | |
| if (current_token.isdigit() and | |
| next_token.lower() in ['元', '块', '钱', '欧', '美元', '英镑', '日元', 'rmb', 'usd', 'eur', 'gbp', 'jpy']): | |
| processed.append(current_token + next_token) | |
| i += 2 | |
| continue | |
| # 数字 + 时间单位 | |
| if (current_token.isdigit() and | |
| next_token in ['天', '日', '周', '月', '年', 'days', 'weeks', 'months']): | |
| processed.append(current_token + next_token) | |
| i += 2 | |
| continue | |
| # 预算 + 数字 | |
| if current_token == '预算' and next_token.replace('.', '').replace(',', '').isdigit(): | |
| if i < len(tokens) - 2 and tokens[i + 2] in ['元', '块', '钱', '欧', 'rmb', 'usd', 'eur']: | |
| processed.append(current_token + next_token + tokens[i + 2]) | |
| i += 3 | |
| continue | |
| else: | |
| processed.append(current_token + next_token) | |
| i += 2 | |
| continue | |
| processed.append(current_token) | |
| i += 1 | |
| return processed | |
| def _extract_destination_from_tokens(self, tokens: list) -> dict: | |
| """从tokens中提取目的地信息""" | |
| result = {} | |
| # 查找城市名 | |
| for i, token in enumerate(tokens): | |
| # 直接匹配城市名 | |
| city_name = self._normalize_city_name(token) | |
| if city_name: | |
| result["name"] = city_name | |
| if city_name in self.european_cities: | |
| result["country"] = self.european_cities[city_name] | |
| break | |
| # 检查是否在动词后面 | |
| if i > 0: | |
| prev_token = tokens[i - 1] | |
| if prev_token in ['去', '到', '想去', '前往', '旅行', '游', '玩', 'go', 'to', 'visit', 'travel']: | |
| city_name = self._normalize_city_name(token) | |
| if city_name: | |
| result["name"] = city_name | |
| if city_name in self.european_cities: | |
| result["country"] = self.european_cities[city_name] | |
| break | |
| # 如果没有找到,尝试fuzzy匹配 | |
| if not result: | |
| for token in tokens: | |
| if len(token) >= 2: | |
| # 模糊匹配城市名 | |
| for city, country in self.european_cities.items(): | |
| if token in city or city in token: | |
| if len(token) >= len(city) * 0.6: # 相似度阈值 | |
| result["name"] = city | |
| result["country"] = country | |
| break | |
| if result: | |
| break | |
| return result | |
| def _normalize_city_name(self, token: str) -> str: | |
| """标准化城市名称""" | |
| if not token: | |
| return "" | |
| token_lower = token.lower().strip() | |
| # 直接匹配 | |
| if token in self.european_cities: | |
| return token | |
| # 别名匹配 | |
| if token_lower in self.european_city_aliases: | |
| return self.european_city_aliases[token_lower] | |
| if token in self.european_city_aliases: | |
| return self.european_city_aliases[token] | |
| return "" | |
| def _extract_duration_from_tokens(self, tokens: list) -> dict: | |
| """从tokens中提取时长信息""" | |
| result = {} | |
| for i, token in enumerate(tokens): | |
| days = None | |
| description = "" | |
| # 处理 "数字+天" 的token | |
| if re.match(r'^\d+[天日]$', token): | |
| days = int(re.findall(r'\d+', token)[0]) | |
| # 处理 "数字+weeks/days" 的token | |
| elif re.match(r'^\d+(days?|weeks?|months?)$', token.lower()): | |
| number = int(re.findall(r'\d+', token)[0]) | |
| unit = re.findall(r'[a-zA-Z]+', token.lower())[0] | |
| if unit.startswith('day'): | |
| days = number | |
| elif unit.startswith('week'): | |
| days = number * 7 | |
| elif unit.startswith('month'): | |
| days = number * 30 | |
| # 处理分离的数字和单位 | |
| elif token.isdigit() and i < len(tokens) - 1: | |
| next_token = tokens[i + 1] | |
| number = int(token) | |
| if next_token in ['天', '日']: | |
| days = number | |
| elif next_token in ['周', '星期', '礼拜', 'week', 'weeks']: | |
| days = number * 7 | |
| elif next_token in ['月', '个月', 'month', 'months']: | |
| days = number * 30 | |
| # 处理中文数字 | |
| elif token in self.chinese_numbers: | |
| days = self.chinese_numbers[token] | |
| description = token | |
| # 处理特殊时长表达 | |
| elif token in ['周末', 'weekend']: | |
| days = 2 | |
| description = token | |
| elif token in ['长周末', 'long weekend']: | |
| days = 3 | |
| description = token | |
| elif token in ['小长假', 'vacation', 'holiday']: | |
| days = 3 | |
| description = token | |
| elif token in ['十一', '国庆', 'national day']: | |
| days = 7 | |
| description = token | |
| elif token in ['春节', 'spring festival']: | |
| days = 7 | |
| description = token | |
| elif token in ['暑假', 'summer vacation']: | |
| days = 60 | |
| description = token | |
| elif token in ['寒假', 'winter vacation']: | |
| days = 30 | |
| description = token | |
| # 处理复合表达 "三天两夜" | |
| elif re.match(r'^[一二三四五六七八九十\d]+天', token): | |
| # 提取数字部分 | |
| for num_token in ['一', '二', '三', '四', '五', '六', '七', '八', '九', '十']: | |
| if token.startswith(num_token): | |
| days = self.chinese_numbers[num_token] | |
| description = token | |
| break | |
| if not days and token[0].isdigit(): | |
| days = int(token[0]) | |
| description = token | |
| # 验证天数合理性并设置结果 | |
| if days and 0.5 <= days <= 365: | |
| result["days"] = int(days) if days >= 1 else days | |
| if not description: | |
| # 添加描述信息 | |
| if days <= 1: | |
| description = "当日往返" | |
| elif days <= 3: | |
| description = "短途旅行" | |
| elif days <= 7: | |
| description = "一周内旅行" | |
| elif days <= 14: | |
| description = "中长途旅行" | |
| elif days <= 30: | |
| description = "长途旅行" | |
| else: | |
| description = "超长途旅行" | |
| result["description"] = description | |
| break | |
| return result | |
| def _extract_budget_from_tokens(self, tokens: list) -> dict: | |
| """从tokens中提取预算信息""" | |
| result = {} | |
| # 1. 查找金额 | |
| for i, token in enumerate(tokens): | |
| amount = None | |
| currency = "RMB" # 默认货币 | |
| # 处理包含货币的token "2000欧", "5000元" | |
| currency_patterns = [ | |
| (r'(\d+(?:\.\d+)?)欧(?:元)?', 'EUR'), | |
| (r'(\d+(?:\.\d+)?)元', 'RMB'), | |
| (r'(\d+(?:\.\d+)?)块(?:钱)?', 'RMB'), | |
| (r'(\d+(?:\.\d+)?)人民币', 'RMB'), | |
| (r'(\d+(?:\.\d+)?)美元', 'USD'), | |
| (r'(\d+(?:\.\d+)?)英镑', 'GBP'), | |
| (r'(\d+(?:\.\d+)?)瑞(?:士)?法郎', 'CHF'), | |
| (r'(\d+(?:\.\d+)?)日元', 'JPY'), | |
| (r'(\d+(?:\.\d+)?)韩元', 'KRW'), | |
| (r'¥(\d+(?:\.\d+)?)', 'RMB'), | |
| (r'€(\d+(?:\.\d+)?)', 'EUR'), | |
| (r'\$(\d+(?:\.\d+)?)', 'USD'), | |
| (r'£(\d+(?:\.\d+)?)', 'GBP'), | |
| (r'(\d+(?:\.\d+)?)rmb', 'RMB'), | |
| (r'(\d+(?:\.\d+)?)usd', 'USD'), | |
| (r'(\d+(?:\.\d+)?)eur', 'EUR'), | |
| (r'(\d+(?:\.\d+)?)gbp', 'GBP'), | |
| (r'(\d+(?:\.\d+)?)chf', 'CHF'), | |
| ] | |
| for pattern, curr in currency_patterns: | |
| match = re.search(pattern, token.lower()) | |
| if match: | |
| amount = float(match.group(1)) | |
| currency = curr | |
| break | |
| # 处理纯数字token(需要查看上下文) | |
| if not amount and re.match(r'^\d+(?:\.\d+)?$', token): | |
| number = float(token) | |
| # 检查前面的token是否有预算相关词汇 | |
| budget_indicators = ['预算', '花费', '费用', '成本', '开销', '支出', '总共', '一共', 'budget', 'cost', 'spend'] | |
| has_budget_context = False | |
| if i > 0 and tokens[i-1] in budget_indicators: | |
| has_budget_context = True | |
| elif i > 1 and tokens[i-2] in budget_indicators: | |
| has_budget_context = True | |
| # 检查后面是否有货币单位 | |
| if i < len(tokens) - 1: | |
| next_token = tokens[i + 1].lower() | |
| currency_units = { | |
| '元': 'RMB', '块': 'RMB', '钱': 'RMB', '人民币': 'RMB', | |
| '欧': 'EUR', '欧元': 'EUR', '美元': 'USD', '英镑': 'GBP', | |
| '瑞郎': 'CHF', '日元': 'JPY', '韩元': 'KRW', | |
| 'rmb': 'RMB', 'usd': 'USD', 'eur': 'EUR', 'gbp': 'GBP', 'chf': 'CHF' | |
| } | |
| if next_token in currency_units: | |
| amount = number | |
| currency = currency_units[next_token] | |
| has_budget_context = True | |
| # 如果有预算上下文但没有明确货币单位,根据数字大小推断 | |
| if has_budget_context and not amount: | |
| if number < 100: # 可能是欧元或美元 | |
| # 查看是否有欧洲城市上下文 | |
| has_european_context = any(self._normalize_city_name(t) for t in tokens) | |
| if has_european_context: | |
| currency = 'EUR' | |
| else: | |
| currency = 'USD' | |
| else: | |
| currency = 'RMB' # 大数字更可能是人民币 | |
| amount = number | |
| # 处理万、千等单位 | |
| if amount: | |
| # 检查是否有万、千修饰符 | |
| if i > 0: | |
| prev_token = tokens[i-1] | |
| if '万' in prev_token or 'w' in prev_token.lower(): | |
| amount *= 10000 | |
| elif '千' in prev_token or 'k' in prev_token.lower(): | |
| amount *= 1000 | |
| elif i < len(tokens) - 1: | |
| next_token = tokens[i+1] | |
| if '万' in next_token or 'w' in next_token.lower(): | |
| amount *= 10000 | |
| elif '千' in next_token or 'k' in next_token.lower(): | |
| amount *= 1000 | |
| if amount > 0: | |
| result["amount"] = int(amount) | |
| result["currency"] = currency | |
| break | |
| # 2. 查找预算类型 | |
| budget_type_keywords = { | |
| 'economy': [ | |
| '经济', '便宜', '省钱', '实惠', '节省', '穷游', '学生', '青年', | |
| '预算有限', '钱不多', '不贵', '划算', '性价比', '背包客', | |
| '简单', '基础', '低成本', '节约', 'budget', 'cheap', 'economy', 'affordable' | |
| ], | |
| 'comfortable': [ | |
| '舒适', '中等', '适中', '一般', '标准', '普通', '正常', '常规', | |
| '中档', '中级', '合理', '平均', '中间档次', 'comfortable', 'standard', 'moderate' | |
| ], | |
| 'luxury': [ | |
| '豪华', '奢华', '高端', '顶级', '精品', '奢侈', '贵族', '皇家', | |
| '贵一点', '不差钱', '任性', '土豪', '有钱', '五星', 'VIP', | |
| 'luxury', 'premium', 'high-end', 'expensive', 'fancy' | |
| ] | |
| } | |
| for token in tokens: | |
| token_lower = token.lower() | |
| for budget_type, keywords in budget_type_keywords.items(): | |
| if any(keyword in token_lower for keyword in keywords): | |
| result["type"] = budget_type | |
| # 找到第一个匹配的关键词作为描述 | |
| for keyword in keywords: | |
| if keyword in token_lower: | |
| result["description"] = keyword if len(keyword) > 2 else token | |
| break | |
| break | |
| if result.get("type"): | |
| break | |
| # 3. 如果有金额但没有类型,根据金额推断类型 | |
| if result.get("amount") and not result.get("type"): | |
| amount = result["amount"] | |
| currency = result.get("currency", "RMB") | |
| # 根据欧洲旅行成本设置阈值 | |
| if currency == "EUR": | |
| if amount < 1500: # 总预算 | |
| result["type"] = "economy" | |
| result["description"] = "经济预算" | |
| elif amount < 4000: | |
| result["type"] = "comfortable" | |
| result["description"] = "舒适预算" | |
| else: | |
| result["type"] = "luxury" | |
| result["description"] = "豪华预算" | |
| elif currency == "USD": | |
| if amount < 2000: | |
| result["type"] = "economy" | |
| result["description"] = "经济预算" | |
| elif amount < 5000: | |
| result["type"] = "comfortable" | |
| result["description"] = "舒适预算" | |
| else: | |
| result["type"] = "luxury" | |
| result["description"] = "豪华预算" | |
| elif currency == "RMB": | |
| if amount < 8000: | |
| result["type"] = "economy" | |
| result["description"] = "经济预算" | |
| elif amount < 20000: | |
| result["type"] = "comfortable" | |
| result["description"] = "舒适预算" | |
| else: | |
| result["type"] = "luxury" | |
| result["description"] = "豪华预算" | |
| # 4. 处理中文数字金额 | |
| chinese_money_mapping = { | |
| '一千': 1000, '两千': 2000, '三千': 3000, '四千': 4000, '五千': 5000, | |
| '六千': 6000, '七千': 7000, '八千': 8000, '九千': 9000, | |
| '一万': 10000, '两万': 20000, '三万': 30000, '四万': 40000, '五万': 50000 | |
| } | |
| if not result.get("amount"): | |
| for token in tokens: | |
| if token in chinese_money_mapping: | |
| result["amount"] = chinese_money_mapping[token] | |
| result["currency"] = "RMB" | |
| break | |
| return result | |
| # 保持向后兼容的验证方法 | |
| def _validate_and_normalize(self, data: dict) -> dict: | |
| """验证和规范化数据""" | |
| return data |