package com.skyversation.poiaddr.util;

import com.skyversation.poiaddr.entity.AddrBean;
import org.springframework.stereotype.Service;

import javax.annotation.PostConstruct;
import java.io.InputStream;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Address splitter backed by two lookup tables loaded from Excel resources at startup.
 *
 * <p>Original author notes (translated):
 * <ol>
 *   <li>Read the village/neighbourhood boundary data to obtain the relation
 *       [district : district code : town/street : neighbourhood committee], together with the
 *       district-to-district-code mapping.</li>
 *   <li>Iterate the xlsx rows, parse the returned address and tokenize it.</li>
 *   <li>The main goal is the town-to-committee relation.</li>
 * </ol>
 * Current version: V2.0.1
 *
 * <p>All lookup tables are static and are filled once by {@link #initFile()}; the static
 * methods read them without synchronization.
 */
@Service
public class AddrSplitLmrMap {

    /** Classpath resource holding the nationwide province / prefecture city / county sheet. */
    private static final String All_no_SHFilePath = "全国省市县记录.xlsx";

    /** Classpath resource holding the Shanghai district / town / committee sheet. */
    private static final String outPutFilePath = "geojson/上海市_村居边界.xlsx";

    /** Shanghai tree: district -> town/street -> neighbourhood committees. */
    private static final Map<String, Map<String, Set<String>>> D_S_C_tree = new HashMap<>();

    /** Nationwide tree: province -> prefecture-level city -> county-level units. */
    private static final Map<String, Map<String, Set<String>>> All_NO_SH_tree = new HashMap<>();

    /** District name -> district code, taken from the Shanghai sheet. */
    private static final Map<String, String> districtCodeMap = new HashMap<>();

    /** Administrative-level keywords; the Shanghai matching only runs when one of them occurs. */
    static Pattern pattern = Pattern.compile("市|区|镇|街道|县");

    /** Road-like suffixes used by {@link #ifTrueAddr(String, String)}. */
    static Pattern spattern = Pattern.compile("路|街|道|村");

    // Patterns used by parseAddress, compiled once instead of on every call.
    // The pattern text is kept exactly as originally written.
    private static final Pattern ROAD_BEFORE_LANE = Pattern.compile("(.+?)[弄]");
    private static final Pattern ROAD = Pattern.compile("(.+?[路街道路巷弄])");
    private static final Pattern LANE = Pattern.compile("([0-9一二三四五六七八九十百千]+[弄支弄]+)");
    private static final Pattern BUILDING = Pattern.compile("([0-9一二三四五六七八九十百千]+[号楼栋号])");
    private static final Pattern FLOOR_AND_ROOM = Pattern.compile("([0-9]{1,2})([0-9]{2,})[室房]");
    private static final Pattern FLOOR = Pattern.compile("([0-9一二三四五六七八九十百千]+[层楼])");
    private static final Pattern ROOM = Pattern.compile("([0-9]+[室房])");

    /**
     * Loads both lookup sheets once the bean has been constructed.
     *
     * @throws RuntimeException if either sheet cannot be found on the classpath
     */
    @PostConstruct
    private void initFile() {
        System.out.println("开始初始化分词器");
        loadShanghaiTree();
        loadNationalTree();
    }

    /**
     * Fills {@link #D_S_C_tree} and {@link #districtCodeMap} from the Shanghai sheet.
     * Rows lacking any of the four key cells are skipped so that one malformed row cannot
     * truncate the table or register an empty name (which would match every address).
     */
    private static void loadShanghaiTree() {
        // Opened outside the try so that a missing resource still propagates to the caller.
        InputStream opened = openResource(outPutFilePath);
        try (InputStream in = opened) {
            for (Map<?, ?> row : ExcelReaderUtils.readExcel(in)) {
                String district = cellText(row, "区");
                String districtCode = cellText(row, "区代码");
                String town = cellText(row, "镇");
                String committee = cellText(row, "居委");
                if (district == null || districtCode == null || town == null || committee == null) {
                    continue;
                }
                districtCodeMap.put(district, districtCode);
                leavesOf(branchOf(D_S_C_tree, district), town).add(committee);
            }
        } catch (Exception e) {
            // Best effort, as before: keep whatever was loaded and carry on.
            System.err.println("Failed to load " + outPutFilePath);
            e.printStackTrace();
        }
    }

    /**
     * Fills {@link #All_NO_SH_tree} from the nationwide sheet. A province is always registered;
     * a city/county pair is only added when both cells are present.
     */
    private static void loadNationalTree() {
        // Opened outside the try so that a missing resource still propagates to the caller.
        InputStream opened = openResource(All_no_SHFilePath);
        try (InputStream in = opened) {
            for (Map<?, ?> row : ExcelReaderUtils.readExcel(in)) {
                String province = cellText(row, "省份");
                if (province == null) {
                    continue;
                }
                String city = cellText(row, "地级市");
                String county = cellText(row, "县级市");
                Map<String, Set<String>> cities = branchOf(All_NO_SH_tree, province);
                if (city != null && county != null) {
                    leavesOf(cities, city).add(county);
                }
            }
        } catch (Exception e) {
            // Best effort, as before: keep whatever was loaded and carry on.
            System.err.println("Failed to load " + All_no_SHFilePath);
            e.printStackTrace();
        }
    }

    /** Opens a classpath resource, trying the path as given and then with a leading slash. */
    private static InputStream openResource(String path) {
        InputStream in = ShanghaiAddressSplitUtil.class.getResourceAsStream(path);
        if (in == null) {
            in = ShanghaiAddressSplitUtil.class.getResourceAsStream("/" + path);
        }
        if (in == null) {
            throw new RuntimeException("无法找到" + path);
        }
        return in;
    }

    /** Returns the cell's text, or null when the cell is missing or empty. */
    private static String cellText(Map<?, ?> row, String column) {
        Object value = row.get(column);
        if (value == null) {
            return null;
        }
        String text = value.toString();
        return text.isEmpty() ? null : text;
    }

    /** Returns the second-level map stored under {@code key}, creating it when absent. */
    private static Map<String, Set<String>> branchOf(Map<String, Map<String, Set<String>>> tree, String key) {
        Map<String, Set<String>> branch = tree.get(key);
        if (branch == null) {
            branch = new HashMap<>();
            tree.put(key, branch);
        }
        return branch;
    }

    /** Returns the leaf set stored under {@code key}, creating it when absent. */
    private static Set<String> leavesOf(Map<String, Set<String>> branch, String key) {
        Set<String> leaves = branch.get(key);
        if (leaves == null) {
            leaves = new HashSet<>();
            branch.put(key, leaves);
        }
        return leaves;
    }

    /** First two characters of {@code name}, or the whole name when it is shorter than that. */
    private static String prefix2(String name) {
        return name.length() <= 2 ? name : name.substring(0, 2);
    }

    /**
     * Cuts the bean's address down to the text following the first occurrence of {@code token}
     * (more precisely: the segment between the first and the second occurrence). The address
     * is left untouched when nothing follows the token. The token is matched literally.
     */
    private static void dropThrough(AddrBean bean, String token) {
        String[] parts = bean.getAddress().split(Pattern.quote(token));
        if (parts.length > 1) {
            bean.setAddress(parts[1]);
        }
    }

    /** Applies {@link #dropThrough} when {@code component} is non-empty and occurs in the address. */
    private static void dropComponent(AddrBean bean, String component) {
        if (component != null && !component.isEmpty() && bean.getAddress().contains(component)) {
            dropThrough(bean, component);
        }
    }

    /**
     * Splits an address into province / city / district / town and the remaining street part.
     *
     * <p>The returned bean always carries the original text ({@code oldAddress}) and the text
     * with spaces removed ({@code address}). The {@code rule} field records the last matching
     * step that fired:
     * <ul>
     *   <li>{@code "-1"} - the text contains "http" and is treated as a link; nothing else is done</li>
     *   <li>{@code "0"} - the text starts with a province name from the nationwide table</li>
     *   <li>{@code "-2"} / {@code "-4"} - a city / county from the nationwide table occurs in the text</li>
     *   <li>{@code "2"} / {@code "4"} / {@code "8"} - Shanghai city / district / town matched
     *       (original author's level legend: province 1, city 2, district 4, town 8, committee 16)</li>
     * </ul>
     * Later steps overwrite earlier ones, and which entry wins when several match depends on the
     * iteration order of the lookup tables. After matching, the matched components are cut off
     * the front of {@code address} so that the street-level remainder is left.
     *
     * @param addr raw address text; must not be null
     * @return the populated bean, never null
     * @throws NullPointerException if {@code addr} is null
     */
    public static AddrBean outAddrMapInAddr(String addr) {
        AddrBean addrMap = new AddrBean();
        addrMap.setOldAddress(addr + "");
        addrMap.setAddress(addr.replace(" ", ""));

        // Links are not addresses: flag them and stop.
        if (addr.contains("http")) {
            addrMap.setRule("-1");
            return addrMap;
        }

        // --- nationwide table: province prefix, then any city / county occurrence ---
        boolean errorAddr = false;
        for (Map.Entry<String, Map<String, Set<String>>> provinceEntry : All_NO_SH_tree.entrySet()) {
            String s = provinceEntry.getKey();
            String provinceShort = prefix2(s);
            // Full name as prefix, or the two-character short name unless it is part of a road name.
            if (addr.startsWith(s) || (addr.startsWith(provinceShort) && !ifTrueAddr(addr, provinceShort))) {
                addrMap.setProvinces(s);
                addrMap.setRule("0");
                if (!s.contains("上海")) {
                    errorAddr = true;
                }
            }
            for (Map.Entry<String, Set<String>> cityEntry : provinceEntry.getValue().entrySet()) {
                String m = cityEntry.getKey();
                // "<name>场" is excluded from counting as a place-name hit.
                if (addr.contains(m) && !addr.contains(m + "场")) {
                    addrMap.setProvinces(s);
                    addrMap.setMarket(m);
                    addrMap.setRule("-2");
                }
                for (String x : cityEntry.getValue()) {
                    if (addr.contains(x) && !addr.contains(x + "场")) {
                        addrMap.setDistinguish(x);
                        addrMap.setProvinces(s);
                        addrMap.setMarket(m);
                        addrMap.setRule("-4");
                        break; // leaves only the county loop
                    }
                }
            }
        }

        // --- Shanghai table: only when no foreign province prefix was seen ---
        if (!errorAddr && pattern.matcher(addr).find()) {
            if (addr.startsWith("上海")) {
                addrMap.setProvinces("上海市");
                addrMap.setMarket("上海市");
                addrMap.setRule("2");
            }

            boolean ifContains = false; // set only by an exact district (or "<xx>县") hit
            String sh_distinguish = "";
            for (String d : D_S_C_tree.keySet()) {
                String districtShort = prefix2(d);
                if (addr.contains(d) || addr.contains(districtShort + "县")) {
                    ifContains = true;
                    addrMap.setProvinces("上海市");
                    addrMap.setMarket("上海市");
                    addrMap.setDistinguish(d);
                    sh_distinguish = d;
                    addrMap.setRule("4");
                    break;
                }
                // NOTE(review): the province check above uses !ifTrueAddr, this one does not -
                // a short name directly followed by 路/街/道/村 is accepted as the district.
                // Kept as written; confirm whether a negation is missing.
                if (addr.contains(districtShort) && ifTrueAddr(addr, districtShort)) {
                    addrMap.setProvinces("上海市");
                    addrMap.setMarket("上海市");
                    addrMap.setDistinguish(d);
                    sh_distinguish = d;
                    addrMap.setRule("4");
                }
            }

            for (String d : D_S_C_tree.keySet()) {
                for (String s : D_S_C_tree.get(d).keySet()) {
                    if (addr.contains(s)) {
                        addrMap.setProvinces("上海市");
                        addrMap.setMarket("上海市");
                        addrMap.setDistinguish(d);
                        addrMap.setStreetTown(s);
                        addrMap.setRule("8");
                        break; // leaves only this district's town loop
                    }
                    // Fuzzy town match by two-character prefix, restricted to the district
                    // that was matched exactly above.
                    if (addr.contains(prefix2(s)) && ifContains && !sh_distinguish.isEmpty()
                            && sh_distinguish.contains(d)) {
                        addrMap.setProvinces("上海市");
                        addrMap.setMarket("上海市");
                        addrMap.setDistinguish(d);
                        addrMap.setStreetTown(s);
                        addrMap.setRule("8");
                    }
                }
            }
        }

        // Special case: "工业区" inside 松江区 is mapped to a fixed town name.
        if (addrMap.getDistinguish() != null && addrMap.getAddress() != null
                && addrMap.getDistinguish().contains("松江区") && addrMap.getAddress().contains("工业区")) {
            addrMap.setStreetTown("松江技术开发区");
            dropThrough(addrMap, "工业区");
        }

        // Strip the matched components, outermost first, to leave the street-level part.
        dropComponent(addrMap, addrMap.getProvinces());
        dropComponent(addrMap, addrMap.getMarket());

        String district = addrMap.getDistinguish();
        if (district != null && !district.isEmpty()) {
            String countyAlias = prefix2(district) + "县";
            if (addrMap.getAddress().contains(district)) {
                dropThrough(addrMap, district);
            } else if (addrMap.getAddress().contains(countyAlias)) {
                dropThrough(addrMap, countyAlias);
            }
        }

        dropComponent(addrMap, addrMap.getStreetTown());
        dropComponent(addrMap, addrMap.getResidentialCommittee());

        // Drop everything up to and including a trailing "...委会" / "...员会" committee name.
        String remaining = addrMap.getAddress();
        if (remaining.contains("委会") || remaining.contains("员会")) {
            int cut = Math.max(remaining.indexOf("委会"), remaining.indexOf("员会")) + 2;
            addrMap.setAddress(remaining.substring(cut));
        }
        return addrMap;
    }

    /**
     * Tells whether the first occurrence of {@code tagStr} in {@code addr} is directly followed
     * (within the next two characters) by a road-like suffix: 路, 街, 道 or 村.
     *
     * @param addr   address text to inspect
     * @param tagStr place-name fragment to look for
     * @return true if such a suffix follows the fragment; false otherwise, including when the
     *         fragment does not occur or either argument is null
     */
    public static boolean ifTrueAddr(String addr, String tagStr) {
        if (addr == null || tagStr == null) {
            return false;
        }
        int tagStart = addr.indexOf(tagStr);
        if (tagStart < 0) {
            return false;
        }
        int from = tagStart + tagStr.length();
        int to = Math.min(addr.length(), from + 2);
        return spattern.matcher(addr.substring(from, to)).find();
    }

    /**
     * Parses a Chinese street address into a fixed five-slot list.
     *
     * @param address raw address text, may be null
     * @return a mutable list of exactly five entries in the order road, lane, building,
     *         floor, room; a slot is null when that part was not recognised
     */
    public static List<String> parseAddress(String address) {
        List<String> result = new ArrayList<>(Collections.<String>nCopies(5, null));
        if (address == null || address.trim().isEmpty()) {
            return result;
        }
        address = address.trim();

        // 1. Road. Simplified rule: everything in front of the first "弄" is the road.
        // NOTE(review): in this branch the lane number ends up inside the road slot and the
        // remainder starts with "弄", so steps 2 and 3 cannot match at position 0
        // (e.g. "广富林1188弄167号313室" gives road "广富林1188", lane null, building null).
        // Kept as written; confirm the intended split before changing it.
        Matcher roadMatcher = ROAD_BEFORE_LANE.matcher(address);
        if (roadMatcher.lookingAt()) {
            result.set(0, roadMatcher.group(1));
            address = address.substring(roadMatcher.end() - 1); // remainder keeps the "弄"
        } else {
            // No "弄": fall back to the road-suffix pattern.
            roadMatcher = ROAD.matcher(address);
            if (roadMatcher.lookingAt()) {
                result.set(0, roadMatcher.group(1));
                address = address.substring(roadMatcher.end());
            }
        }

        // 2. Lane, including consecutive lane / sub-lane segments.
        StringBuilder laneBuilder = new StringBuilder();
        Matcher laneMatcher = LANE.matcher(address);
        while (laneMatcher.lookingAt()) {
            laneBuilder.append(laneMatcher.group(1));
            address = address.substring(laneMatcher.end());
            laneMatcher = LANE.matcher(address);
        }
        if (laneBuilder.length() > 0) {
            result.set(1, laneBuilder.toString());
        }

        // 3. Building number.
        Matcher buildingMatcher = BUILDING.matcher(address);
        if (buildingMatcher.lookingAt()) {
            result.set(2, buildingMatcher.group(1));
            address = address.substring(buildingMatcher.end());
        }

        // 4./5. A room number of three or more digits encodes the floor in its leading
        // one or two digits; the room part must keep at least two digits.
        Matcher floorRoomMatcher = FLOOR_AND_ROOM.matcher(address);
        if (floorRoomMatcher.find()) {
            String floorPart = floorRoomMatcher.group(1);
            String roomPart = floorRoomMatcher.group(2);
            result.set(3, floorPart + "层");
            result.set(4, floorPart + roomPart + "室"); // leading zeros of the room part are kept
        } else {
            // Otherwise look for an explicit floor, then a plain room number.
            Matcher floorMatcher = FLOOR.matcher(address);
            if (floorMatcher.find()) {
                result.set(3, floorMatcher.group(1));
                address = address.substring(0, floorMatcher.start()) + address.substring(floorMatcher.end());
            }
            Matcher roomMatcher = ROOM.matcher(address);
            if (roomMatcher.find()) {
                result.set(4, roomMatcher.group(1));
            }
        }
        return result;
    }

    /** Manual smoke test for {@link #parseAddress(String)}. */
    public static void main(String[] args) {
        /*AddrSplitLmrMap AddrSplitLmrMap = new AddrSplitLmrMap();
        AddrSplitLmrMap.initFile();
        System.out.println(outAddrMapInAddr("村165号"));
        System.out.println(outAddrMapInAddr("上海市松江区乐都路"));
        System.out.println(outAddrMapInAddr("云南省昭通市昭阳区永丰镇绿荫社区居民委员会管湾村二十五组205号"));*/

        // Sample with a single-digit floor.
        String address4 = "广富林1188弄167号313室";
        System.out.println("\n测试地址: " + address4);
        printParsedResult(parseAddress(address4));
    }

    /** Prints the five slots returned by {@link #parseAddress(String)}, one per line. */
    private static void printParsedResult(List<String> parsed) {
        System.out.println("解析结果:");
        System.out.println("路: " + parsed.get(0));
        System.out.println("弄: " + parsed.get(1));
        System.out.println("号楼: " + parsed.get(2));
        System.out.println("层: " + parsed.get(3));
        System.out.println("室: " + parsed.get(4));
    }
}