123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372 |
- package com.skyversation.poiaddr.util;
- import com.skyversation.poiaddr.entity.AddrBean;
- import org.springframework.stereotype.Service;
- import javax.annotation.PostConstruct;
- import java.io.InputStream;
- import java.util.*;
- import java.util.regex.Matcher;
- import java.util.regex.Pattern;
- /**
- * # 生成完整的上海市县乡记录.xlsx
- * * 1、读取村居边界.geojson\得到对应关系【所属区:区代码:所属街:居委_1】,同时得到区和区代码的对应关系
- * * 2、遍历xlsx文件列表,然后解析returnAddress,进行分词
- * * 3、主要是获取街镇和居委的对应关系【判断到街镇后,得到下标,然后判断后面是否存在居委会或村委会关键字】
- * 当前版本:V2.0.1
- */
- @Service
- public class AddrSplitLmrMap {
- // 上海市村居边界geojson文件地址
- private static String All_no_SHFilePath = "全国省市县记录.xlsx";
- private static String outPutFilePath = "geojson/上海市_村居边界.xlsx";
- // 《区—街镇-居委》的对应关系
- private static HashMap<String, HashMap<String, Set<String>>> D_S_C_tree = new HashMap<>();
- // 非上海的《省-市-区》的对应关系
- private static HashMap<String, HashMap<String, Set<String>>> All_NO_SH_tree = new HashMap<>();
- // 区和区代码的对应关系
- private static HashMap<String, String> districtCodeMap = new HashMap<>();
- @PostConstruct
- private void initFile() {
- System.out.println("开始初始化分词器");
- InputStream is = ShanghaiAddressSplitUtil.class.getResourceAsStream(outPutFilePath);
- if (is == null) is = ShanghaiAddressSplitUtil.class.getResourceAsStream("/" + outPutFilePath);
- if (is == null) throw new RuntimeException("无法找到" + outPutFilePath);
- try {
- for (Map<String, Object> row : ExcelReaderUtils.readExcel(is)) {
- districtCodeMap.put(row.get("区").toString(), row.get("区代码").toString());
- if (D_S_C_tree.containsKey(row.get("区").toString())) {
- Map<String, Set<String>> SCT = D_S_C_tree.get(row.get("区").toString());
- if (SCT.containsKey(row.get("镇").toString())) {
- SCT.get(row.get("镇").toString()).add(row.get("居委").toString());
- } else {
- Set<String> CL = new HashSet<>();
- CL.add(row.get("居委").toString());
- SCT.put(row.get("镇").toString(), CL);
- }
- } else {
- HashMap<String, Set<String>> SCT = new HashMap<>();
- Set<String> CL = new HashSet<>();
- CL.add(row.get("居委").toString());
- SCT.put(row.get("镇").toString(), CL);
- D_S_C_tree.put(row.get("区").toString(), SCT);
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- InputStream is2 = ShanghaiAddressSplitUtil.class.getResourceAsStream(All_no_SHFilePath);
- if (is2 == null) is2 = ShanghaiAddressSplitUtil.class.getResourceAsStream("/" + All_no_SHFilePath);
- if (is2 == null) throw new RuntimeException("无法找到" + All_no_SHFilePath);
- try {
- for (Map<String, Object> row : ExcelReaderUtils.readExcel(is2)) {
- String ss = row.get("省份").toString();
- String djs = row.get("地级市") != null && !row.get("地级市").toString().isEmpty() ? row.get("地级市").toString() : null;
- String xjs = row.get("县级市") != null && !row.get("县级市").toString().isEmpty() ? row.get("县级市").toString() : null;
- if (All_NO_SH_tree.containsKey(ss)) {
- Map<String, Set<String>> SCT = All_NO_SH_tree.get(ss);
- if (djs != null && xjs != null) {
- if (SCT.containsKey(djs)) {
- SCT.get(djs).add(xjs);
- } else {
- Set<String> CL = new HashSet<>();
- CL.add(xjs);
- SCT.put(djs, CL);
- }
- }
- } else {
- HashMap<String, Set<String>> SCT = new HashMap<>();
- if (djs != null && xjs != null) {
- Set<String> CL = new HashSet<>();
- CL.add(xjs);
- SCT.put(djs, CL);
- }
- All_NO_SH_tree.put(ss, SCT);
- }
- }
- } catch (Exception e) {
- e.printStackTrace();
- }
- }
- /**
- * ## 分词:返回实体类【原地址:省:市:区:镇:居委:路牌地址:标准地址:区代码】
- * * 1、初始化分词模型(读取村居边界.geojson\得到对应关系【所属区:区代码:所属街:居委_1】)(读取全国省市县记录.xlsx\得到对应关系【省:市:区县】)
- * * 2、判断地址字符串是否存在[省、市、区、镇、街道、县]
- * * 3、如果存在【省、市】判断是否是非上海市,是的话返回rule:0
- * * 4、不知道是不是上海市的话,判断【区、镇】(先全词匹配,匹配不到的话模糊匹配)
- * * 5、将匹配到的“区代码”拼接上310,否则直接是310000,作为搜索条件
- * * 6、得到返回结果列表
- * <p>
- * // 省1:市2:区4:镇8:居委16
- *
- * @param addr
- * @return
- */
- static Pattern pattern = Pattern.compile("市|区|镇|街道|县");
- static Pattern spattern = Pattern.compile("路|街|道|村");
- public static AddrBean outAddrMapInAddr(String addr) {
- AddrBean addrMap = new AddrBean();
- addrMap.setOldAddress(addr + "");
- addrMap.setAddress(addr.replaceAll(" ", ""));
- if (addr.contains("http")) {
- // 先判断是否是链接
- addrMap.setRule("-1");
- } else {
- // 判断外地省名
- boolean errorAddr = false;
- for (String s : All_NO_SH_tree.keySet()) {
- if (addr.startsWith(s) || (addr.startsWith(s.substring(0, 2)) && !ifTrueAddr(addr, s.substring(0, 2)))) {
- addrMap.setProvinces(s);
- addrMap.setRule("0");
- if (!s.contains("上海")) {
- errorAddr = true;
- }
- }
- // 判断外地市名
- for (String m : All_NO_SH_tree.get(s).keySet()) {
- if (addr.contains(m) && !addr.contains(m + "场")) {
- addrMap.setProvinces(s);
- addrMap.setMarket(m);
- addrMap.setRule("-2");
- }
- // 判断外地县名
- for (String x : All_NO_SH_tree.get(s).get(m)) {
- if (addr.contains(x) && !addr.contains(x + "场")) {
- addrMap.setDistinguish(x);
- addrMap.setProvinces(s);
- addrMap.setMarket(m);
- addrMap.setRule("-4");
- break;
- }
- }
- }
- }
- // 如果不是外地数据和连接数据的话
- if (!errorAddr) {
- // 上海地址匹配
- if (pattern.matcher(addr).find()) {
- if (addr.startsWith("上海")) {
- addrMap.setProvinces("上海市");
- addrMap.setMarket("上海市");
- addrMap.setRule("2");
- }
- // 匹配区
- boolean ifContains = false;
- // 区匹配标识
- String sh_distinguish = "";
- for (String d : D_S_C_tree.keySet()) {
- if (addr.contains(d) || addr.contains(d.substring(0, 2) + "县")) {
- ifContains = true;
- addrMap.setProvinces("上海市");
- addrMap.setMarket("上海市");
- addrMap.setDistinguish(d);
- sh_distinguish = d;
- addrMap.setRule("4");
- break;
- }
- if (addr.contains(d.substring(0, 2)) && ifTrueAddr(addr, d.substring(0, 2))) {
- addrMap.setProvinces("上海市");
- addrMap.setMarket("上海市");
- addrMap.setDistinguish(d);
- sh_distinguish = d;
- addrMap.setRule("4");
- }
- }
- // 镇匹配
- for (String d : D_S_C_tree.keySet()) {
- for (String s : D_S_C_tree.get(d).keySet()) {
- if (addr.contains(s)) {
- addrMap.setProvinces("上海市");
- addrMap.setMarket("上海市");
- addrMap.setDistinguish(d);
- addrMap.setStreetTown(s);
- addrMap.setRule("8");
- break;
- }
- if (addr.contains(s.substring(0, 2)) && ifContains && !sh_distinguish.isEmpty() && sh_distinguish.contains(d)) {
- addrMap.setProvinces("上海市");
- addrMap.setMarket("上海市");
- addrMap.setDistinguish(d);
- addrMap.setStreetTown(s);
- addrMap.setRule("8");
- }
- }
- }
- }
- }
- // 特殊处理逻辑
- if (addrMap.getDistinguish() != null && addrMap.getAddress() != null && addrMap.getDistinguish().contains("松江区") && addrMap.getAddress().contains("工业区")) {
- addrMap.setStreetTown("松江技术开发区");
- if (addrMap.getAddress().split("工业区").length > 1) {
- addrMap.setAddress(addrMap.getAddress().split("工业区")[1]);
- }
- }
- // 输出路牌
- if (addrMap.getProvinces() != null && !addrMap.getProvinces().isEmpty() && addrMap.getAddress().contains(addrMap.getProvinces())) {
- if (addrMap.getAddress().split(addrMap.getProvinces()).length > 1) {
- addrMap.setAddress(addrMap.getAddress().split(addrMap.getProvinces())[1]);
- }
- }
- if (addrMap.getMarket() != null && !addrMap.getMarket().isEmpty() && addrMap.getAddress().contains(addrMap.getMarket())) {
- if (addrMap.getAddress().split(addrMap.getMarket()).length > 1) {
- addrMap.setAddress(addrMap.getAddress().split(addrMap.getMarket())[1]);
- }
- }
- if (addrMap.getDistinguish() != null && !addrMap.getDistinguish().isEmpty()) {
- if (addrMap.getAddress().contains(addrMap.getDistinguish())) {
- if (addrMap.getAddress().split(addrMap.getDistinguish()).length > 1) {
- addrMap.setAddress(addrMap.getAddress().split(addrMap.getDistinguish())[1]);
- }
- } else if (addrMap.getAddress().contains(addrMap.getDistinguish().substring(0, 2) + "县")) {
- if (addrMap.getAddress().split(addrMap.getDistinguish().substring(0, 2) + "县").length > 1) {
- addrMap.setAddress(addrMap.getAddress().split(addrMap.getDistinguish().substring(0, 2) + "县")[1]);
- }
- }
- }
- if (addrMap.getStreetTown() != null && !addrMap.getStreetTown().isEmpty() && addrMap.getAddress().contains(addrMap.getStreetTown())) {
- if (addrMap.getAddress().split(addrMap.getStreetTown()).length > 1) {
- addrMap.setAddress(addrMap.getAddress().split(addrMap.getStreetTown())[1]);
- }
- }
- if (addrMap.getResidentialCommittee() != null && !addrMap.getResidentialCommittee().isEmpty() && addrMap.getAddress().contains(addrMap.getResidentialCommittee())) {
- if (addrMap.getAddress().split(addrMap.getResidentialCommittee()).length > 1) {
- addrMap.setAddress(addrMap.getAddress().split(addrMap.getResidentialCommittee())[1]);
- }
- }
- if (addrMap.getAddress().contains("委会") || addrMap.getAddress().contains("员会")) {
- addrMap.setAddress(addrMap.getAddress().substring(Math.max(addrMap.getAddress().indexOf("委会"), addrMap.getAddress().indexOf("员会")) + 2));
- }
- }
- return addrMap;
- }
- /**
- * 判断是否是名称+路名|街名的格式
- *
- * @param addr
- * @param tagStr
- * @return
- */
- public static boolean ifTrueAddr(String addr, String tagStr) {
- String endStr = addr.substring(addr.indexOf(tagStr) + tagStr.length(), Math.min(addr.length(), addr.indexOf(tagStr) + tagStr.length() + 2));
- if (spattern.matcher(endStr).find()) {
- return true;
- }
- return false;
- }
- /**
- * 解析中文地址为结构化列表
- *
- * @param address 原始地址字符串
- * @return 分词后的列表,按路、弄、号楼、层、室的顺序
- */
- public static List<String> parseAddress(String address) {
- List<String> result = new ArrayList<>(Arrays.asList(null, null, null, null, null));
- if (address == null || address.trim().isEmpty()) {
- return result;
- }
- address = address.trim();
- // 1. 提取路(简化版:直接匹配"弄"之前的所有字符)
- String roadPattern = "(.+?)[弄]";
- Matcher roadMatcher = Pattern.compile(roadPattern).matcher(address);
- if (roadMatcher.find() && roadMatcher.start() == 0) {
- result.set(0, roadMatcher.group(1));
- address = address.substring(roadMatcher.end() - 1); // 从"弄"之后开始截取
- } else {
- // 如果没有找到"弄",则尝试使用原来的路名匹配逻辑
- roadPattern = "(.+?[路街道路巷弄])";
- roadMatcher = Pattern.compile(roadPattern).matcher(address);
- if (roadMatcher.find() && roadMatcher.start() == 0) {
- result.set(0, roadMatcher.group(1));
- address = address.substring(roadMatcher.end());
- }
- }
- // 2. 提取弄(支持连续的弄和支弄)
- String lanePattern = "([0-9一二三四五六七八九十百千]+[弄支弄]+)";
- Matcher laneMatcher = Pattern.compile(lanePattern).matcher(address);
- StringBuilder laneBuilder = new StringBuilder();
- while (laneMatcher.find() && laneMatcher.start() == 0) {
- laneBuilder.append(laneMatcher.group(1));
- address = address.substring(laneMatcher.end());
- laneMatcher = Pattern.compile(lanePattern).matcher(address);
- }
- if (laneBuilder.length() > 0) {
- result.set(1, laneBuilder.toString());
- }
- // 3. 提取号楼(支持XX号格式)
- String buildingPattern = "([0-9一二三四五六七八九十百千]+[号楼栋号])";
- Matcher buildingMatcher = Pattern.compile(buildingPattern).matcher(address);
- if (buildingMatcher.find() && buildingMatcher.start() == 0) {
- result.set(2, buildingMatcher.group(1));
- address = address.substring(buildingMatcher.end());
- }
- // 智能楼层室号解析
- String roomPattern = "([0-9]{1,2})([0-9]{2,})[室房]"; // 修改正则表达式,确保室号部分至少两位数
- Matcher roomMatcher = Pattern.compile(roomPattern).matcher(address);
- if (roomMatcher.find()) {
- String floorPart = roomMatcher.group(1);
- String roomPart = roomMatcher.group(2);
- // 设置楼层
- result.set(3, floorPart + "层");
- // 设置室号(直接使用匹配到的部分,不去除前导零)
- result.set(4, floorPart + roomPart + "室"); // 修改此处,直接使用roomPart
- // 移除已匹配的部分
- address = address.substring(0, roomMatcher.start()) +
- address.substring(roomMatcher.end());
- } else {
- // 4. 提取层
- String floorPattern = "([0-9一二三四五六七八九十百千]+[层楼])";
- Matcher floorMatcher = Pattern.compile(floorPattern).matcher(address);
- if (floorMatcher.find()) {
- result.set(3, floorMatcher.group(1));
- address = address.substring(0, floorMatcher.start()) + address.substring(floorMatcher.end());
- }
- // 5. 提取室
- String roomPatternSimple = "([0-9]+[室房])";
- Matcher roomMatcherSimple = Pattern.compile(roomPatternSimple).matcher(address);
- if (roomMatcherSimple.find()) {
- result.set(4, roomMatcherSimple.group(1));
- }
- }
- return result;
- }
- public static void main(String[] args) {
- /*AddrSplitLmrMap AddrSplitLmrMap = new AddrSplitLmrMap();
- AddrSplitLmrMap.initFile();
- System.out.println(outAddrMapInAddr("村165号"));
- System.out.println(outAddrMapInAddr("上海市松江区乐都路"));
- System.out.println(outAddrMapInAddr("云南省昭通市昭阳区永丰镇绿荫社区居民委员会管湾村二十五组205号"));*/
- // 测试示例(包含所有典型场景)
- // 测试一位数楼层地址
- String address4 = "广富林1188弄167号313室";
- System.out.println("\n测试地址: " + address4);
- printParsedResult(parseAddress(address4));
- }
- private static void printParsedResult(List<String> parsed) {
- System.out.println("解析结果:");
- System.out.println("路: " + parsed.get(0));
- System.out.println("弄: " + parsed.get(1));
- System.out.println("号楼: " + parsed.get(2));
- System.out.println("层: " + parsed.get(3));
- System.out.println("室: " + parsed.get(4));
- }
- }
|