Răsfoiți Sursa

优化分词器

ximinghao 1 lună în urmă
părinte
comite
8778a602fd

+ 389 - 239
src/main/java/com/skyversation/poiaddr/util/address_spliter/ShanghaiAddressSplitUtil.java

@@ -2,6 +2,7 @@ package com.skyversation.poiaddr.util.address_spliter;
 
 import com.skyversation.poiaddr.util.ExcelReaderUtils;
 import lombok.AllArgsConstructor;
+import lombok.Getter;
 import org.springframework.stereotype.Service;
 
 import javax.annotation.PostConstruct;
@@ -10,11 +11,12 @@ import java.util.*;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;
 import java.util.stream.Collectors;
+import java.util.stream.Stream;
 
 @Service
 public class ShanghaiAddressSplitUtil {
     @AllArgsConstructor
-    static class threeLevelAddress{
+    static class ThreeLevelAddress {
         String district;
         String street;
         String community;
@@ -24,13 +26,39 @@ public class ShanghaiAddressSplitUtil {
         String districtCode;
         String streetCode;
         String communityCode;
+    }
+
+    static class ThreeLevelAddressTree { // Tree-shaped holder for ThreeLevelAddress records; not referenced anywhere in the visible part of this change.
+        private static class node { // One tree node. NOTE(review): lower-case class name breaks Java naming conventions (this same commit renames threeLevelAddress to ThreeLevelAddress).
+            node parent; // link towards the root; presumably null for a root node — TODO confirm
+            List<node> children; // direct descendants; not initialised here, so null until assigned
+            ThreeLevelAddress address; // the address record carried by this node
+        }
+
+        List<node> rootNodes; // top-level nodes of the tree; not initialised here, so null until assigned
+    }
 
+    @Getter // Lombok generates getAddress(), getLocation() and isCompleteMatch()
+    static class AddressPart { // One occurrence of a candidate place name inside the string being split.
+        String address; // the name that was searched for and found
+        int location; // position of the occurrence as supplied by the creator (contain() passes the indexOf result)
+        boolean completeMatch; // false until matchCompete() is called; then the hit counts as confirmed
+
+        AddressPart(String address, int location) { // creates an unconfirmed hit (completeMatch keeps its default, false)
+            this.address = address;
+            this.location = location;
+        }
+
+        void matchCompete() { // flags this hit as confirmed. NOTE(review): name looks like a typo for "matchComplete"; a rename must update every call site at once
+            completeMatch = true;
+        }
     }
-    private static Map<String,List<threeLevelAddress>> All_STREET_IN_SHANGHAI;
-    private static Map<String,List<threeLevelAddress>> All_COMMUNITY_IN_SHANGHAI;
-    private static Map<String,List<String>> DISTRICT_TO_STREET_MAP;
-    private static Map<String,List<String>> STREET_TO_COMMUNITY_MAP;
-    private static Map<String,List<String>> DISTRICT_TO_COMMUNITY_MAP;
+
+    private static Map<String, List<ThreeLevelAddress>> All_STREET_IN_SHANGHAI;
+    private static Map<String, List<ThreeLevelAddress>> All_COMMUNITY_IN_SHANGHAI;
+    private static Map<String, List<String>> DISTRICT_TO_STREET_MAP;
+    private static Map<String, List<String>> STREET_TO_COMMUNITY_MAP;
+    private static Map<String, List<String>> DISTRICT_TO_COMMUNITY_MAP;
 
     private static final Pattern LEVEL_1_SUFFIX_PATTERN = Pattern.compile("^(?:区|新区)");
 
@@ -42,22 +70,23 @@ public class ShanghaiAddressSplitUtil {
 
     private static final Pattern UN_ADDRESS_PATTERN = Pattern.compile("http");
 
-    private static final Pattern OVER_SPLIT=Pattern.compile("^(?:[0123456789-\\-一二三四五六七八九十大A-za-z]{0,4}[街队组栋号站弄]|(?:车站|工业区|市场|农贸市场)(?![东南西北中一二三四五六七八九十公大小支新老环]路)|[A-za-z]?[0123456789-\\-])");
+    private static final Pattern OVER_SPLIT = Pattern.compile("^(?:[0123456789-\\-一二三四五六七八九十大A-za-z]{0,4}[街队组栋号站弄]|(?:车站|工业区|市场|农贸市场)(?![东南西北中一二三四五六七八九十公大小支新老环]路)|[A-za-z]?[0123456789-\\-])");
 
     private static final Pattern MULTI_ADDRESS = Pattern.compile("(?<=[0-9])[号弄]?[、—/\\\\-][0-9]+(?=[号弄])");
+
     @PostConstruct
-    private void init(){
+    private void init() {
         System.out.println("开始初始化分词器");
-        Map<String,threeLevelAddress> districtMap= new HashMap<>();
-        Map<String,List<threeLevelAddress>> streetMap= new HashMap<>();
-        Map<String,List<threeLevelAddress>> communityMap= new HashMap<>();
-        Map<String,List<String>> districtToStreetMap=new HashMap<>();
-        Map<String,List<String>> streetToCommunityMap=new HashMap<>();
+        Map<String, ThreeLevelAddress> districtMap = new HashMap<>();
+        Map<String, List<ThreeLevelAddress>> streetMap = new HashMap<>();
+        Map<String, List<ThreeLevelAddress>> communityMap = new HashMap<>();
+        Map<String, List<String>> districtToStreetMap = new HashMap<>();
+        Map<String, List<String>> streetToCommunityMap = new HashMap<>();
 
-        String file = "上海市县乡记录.xlsx";
+        String file = "excel/上海市县乡记录.xlsx";
         InputStream is = ShanghaiAddressSplitUtil.class.getResourceAsStream(file);
-        if (is==null) is= ShanghaiAddressSplitUtil.class.getResourceAsStream("/"+file);
-        if (is==null) throw new RuntimeException("无法找到"+file);
+        if (is == null) is = ShanghaiAddressSplitUtil.class.getResourceAsStream("/" + file);
+        if (is == null) throw new RuntimeException("无法找到" + file);
         try {
             for (Map<String, Object> row : ExcelReaderUtils.readExcel(is)) {
                 String district = Optional.ofNullable(row.get("县级市简称")).map(Object::toString).orElse("");
@@ -69,23 +98,25 @@ public class ShanghaiAddressSplitUtil {
                 String districtCode = Optional.ofNullable(row.get("县级市编码")).map(Object::toString).orElse("");
                 String streetCode = Optional.ofNullable(row.get("街道编码")).map(Object::toString).orElse("");
                 String communityCode = Optional.ofNullable(row.get("居委编码")).map(Object::toString).orElse("");
-                initData(district, street, community, districtFullName, streetFullName, communityFullName,districtCode, streetCode, communityCode, districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
+                initData(district, street, community, districtFullName, streetFullName, communityFullName, districtCode, streetCode, communityCode, districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
             }
             //自贸区
-            initData("浦东",  "试验区","", "浦东新区", "自由贸易试验区","","310115","","",  districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
+            initData("浦东", "试验区", "", "浦东新区", "自由贸易试验区", "", "310115", "", "", districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
             //松江镇特别处理
-            initData("松江",  "松江","", "松江区", "","","310117","","",  districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
+            initData("松江", "松江", "", "松江区", "", "", "310117", "", "", districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
             //金山工业区
-            initData("金山",  "金山工业区","", "金山区", "金山工业区","","310116","","",  districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
+            initData("金山", "金山工业区", "", "金山区", "金山工业区", "", "310116", "", "", districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
 
         } catch (Exception e) {
             throw new RuntimeException(e);
         }
         All_STREET_IN_SHANGHAI = Collections.unmodifiableMap(streetMap);
         All_COMMUNITY_IN_SHANGHAI = Collections.unmodifiableMap(communityMap);
-        DISTRICT_TO_STREET_MAP=Collections.unmodifiableMap(districtToStreetMap);
-        STREET_TO_COMMUNITY_MAP=Collections.unmodifiableMap(streetToCommunityMap);
-        DISTRICT_TO_COMMUNITY_MAP=Collections.unmodifiableMap(DISTRICT_TO_STREET_MAP.entrySet().stream()
+        DISTRICT_TO_STREET_MAP = Collections.unmodifiableMap(districtToStreetMap.entrySet()
+                .stream().collect(Collectors.toMap(Map.Entry::getKey, e -> e.getValue()
+                        .stream().distinct().collect(Collectors.toList()))));
+        STREET_TO_COMMUNITY_MAP = Collections.unmodifiableMap(streetToCommunityMap);
+        DISTRICT_TO_COMMUNITY_MAP = Collections.unmodifiableMap(DISTRICT_TO_STREET_MAP.entrySet().stream()
                 .collect(Collectors.toMap(
                         Map.Entry::getKey,
                         entry -> entry.getValue().stream()
@@ -95,215 +126,244 @@ public class ShanghaiAddressSplitUtil {
         System.out.println("分词器初始化完成");
     }
 
-    private static void initData(String district, String street, String community, String districtFullName, String streetFullName, String communityFullName,String districtCode, String streetCode, String communityCode, Map<String, threeLevelAddress> districtMap, Map<String, List<threeLevelAddress>> streetMap, Map<String, List<threeLevelAddress>> communityMap, Map<String, List<String>> districtToStreetMap, Map<String, List<String>> streetToCommunityMap) {
-        threeLevelAddress add = new threeLevelAddress(district, street, community, districtFullName, streetFullName, communityFullName, districtCode ,streetCode, communityCode);
-        districtMap.put(district,add);
-        if (!streetMap.containsKey(street)) streetMap.put(street,new ArrayList<>());
+    private static void initData(String district, String street, String community, String districtFullName, String streetFullName, String communityFullName, String districtCode, String streetCode, String communityCode, Map<String, ThreeLevelAddress> districtMap, Map<String, List<ThreeLevelAddress>> streetMap, Map<String, List<ThreeLevelAddress>> communityMap, Map<String, List<String>> districtToStreetMap, Map<String, List<String>> streetToCommunityMap) { // Registers one district/street/community record in all five lookup maps passed in.
+        ThreeLevelAddress add = new ThreeLevelAddress(district, street, community, districtFullName, streetFullName, communityFullName, districtCode, streetCode, communityCode); // a single record object is shared by every map below
+        districtMap.put(district, add); // plain put: a later record for the same district replaces the earlier one
+        if (!streetMap.containsKey(street)) streetMap.put(street, new ArrayList<>()); // create the bucket on first sight of this street name
         streetMap.get(street).add(add); // street short name -> all records carrying it
-        if (!communityMap.containsKey(community)) communityMap.put(community,new ArrayList<>());
+        if (!communityMap.containsKey(community)) communityMap.put(community, new ArrayList<>());
         communityMap.get(community).add(add); // community short name -> all records carrying it (an empty name is a key too)
-        if (!districtToStreetMap.containsKey(district)) districtToStreetMap.put(district,new ArrayList<>());
+        if (!districtToStreetMap.containsKey(district)) districtToStreetMap.put(district, new ArrayList<>());
         districtToStreetMap.get(district).add(street); // appended once per record, so street names repeat in this list
-        if (!streetToCommunityMap.containsKey(street)) streetToCommunityMap.put(street,new ArrayList<>());
+        if (!streetToCommunityMap.containsKey(street)) streetToCommunityMap.put(street, new ArrayList<>());
         streetToCommunityMap.get(street).add(community); // street short name -> community short names
     }
 
-    private static class splittingAddress{
+    private static class splittingAddress {
         SplitAddress splitAddress;
 
-        int street=-1;
-        int community=-1;
+        List<AddressPart> streetParts = new ArrayList<>();
+        List<AddressPart> communityParts = new ArrayList<>();
 
-        Map<Integer,String> streetMap =new HashMap<>();
-        Map<Integer,String> communityMap=new HashMap<>();
-        threeLevelAddress threeLevelAddress;
 
-        String targetString;
+        ThreeLevelAddress threeLevelAddress;
 
-        void findStreet(){
-            Map<Integer,String> results =null;
-            int completeMatchIndex=-1;
-            //首先尝试在一选下匹配
-            if (splitAddress.getDistrict()!=null){
-                results  = contain(this.targetString,DISTRICT_TO_STREET_MAP.get(splitAddress.getDistrict()),0);
-                completeMatchIndex = washResult(this.targetString,results,LEVEL_2_SUFFIX_PATTERN,LEVEL_3_SUFFIX_PATTERN,LEVEL_1_SUFFIX_PATTERN);
-            }
-            //一选不存在或匹配无结果,直接搜全国
-            if (results==null||results.isEmpty()){
-                results = contain(this.targetString,All_STREET_IN_SHANGHAI.keySet(),0);
-                if (completeMatchIndex==-1)completeMatchIndex = washResult(this.targetString,results,LEVEL_2_SUFFIX_PATTERN,LEVEL_3_SUFFIX_PATTERN,LEVEL_1_SUFFIX_PATTERN);
-
-            }
-
-            streetMap.putAll(results);
-            street=completeMatchIndex;
+        String targetString;
+        private AddressPart bestStreet;
+        private AddressPart bestCommunity;
+
+
+        void findStreet() { // Collects the street-name hits found in targetString into streetParts.
+            List<AddressPart> results = null;
+//            int completeMatchIndex = -1;
+//            // first try to match among the streets of the already chosen district
+//            if (splitAddress.getDistrict() != null) {
+//                results = contain(this.targetString, DISTRICT_TO_STREET_MAP.get(splitAddress.getDistrict().replaceAll("新?区", "")), 0);
+//                washResult(this.targetString, results, LEVEL_2_SUFFIX_PATTERN, LEVEL_3_SUFFIX_PATTERN, LEVEL_1_SUFFIX_PATTERN);
+//            }
+//            // no chosen district, or nothing matched: search the full list directly
+//            if (results == null || results.isEmpty()) {
+            results = contain(this.targetString, All_STREET_IN_SHANGHAI.keySet(), 0); // every occurrence of every known street name
+            washResult(this.targetString, results, LEVEL_2_SUFFIX_PATTERN, LEVEL_3_SUFFIX_PATTERN, LEVEL_1_SUFFIX_PATTERN); // filters results in place; hits followed by LEVEL_2_SUFFIX_PATTERN become confirmed
+
+//            }
+            results = results.stream().filter(addressPart -> { // hand-written exceptions for names that are substrings of other place names
+                int key = addressPart.location;
+                String name = addressPart.address;
+                switch (name) {
+                    case "高桥": {
+                        if (key > 0 && targetString.charAt(key - 1) == '外') {
+                            return false;          // avoid taking "高桥" out of "外高桥"
+                        }
+                        break;
+                    }
+                    case "莘庄": { // NOTE(review): both paths keep the hit (return true here, break + return true below), so this case filters nothing; the sibling cases return false — confirm whether "莘庄工业区" was meant to be rejected
+                        String after = targetString.substring(key + name.length());
+                        if (after.startsWith("工业区")) {
+                            return true;
+                        }
+                        break;
+                    }
+                    case "外滩":{
+                        if (key > 0 && targetString.charAt(key - 1) == '北') {
+                            return false;          // avoid taking "外滩" out of "北外滩"
+                        }
+                        break;
+                    }
+                }
+                return true;
+            }).collect(Collectors.toList());
+            streetParts.addAll(results);
             // when exactly one candidate is left, treat it as the confirmed choice
-            if (streetMap.size()==1){
-                street = (int)streetMap.keySet().toArray()[0];
+            if (streetParts.size() == 1) {
+                streetParts.get(0).matchCompete();
             }
 
         }
 
-        void findCommunity(){
-            Map<Integer,String> results = null;
-            int completeMatchCommunity=-1;
-            String sub=targetString;
+        void findCommunity() { // Collects the community-name hits found in targetString into communityParts.
+            List<AddressPart> results = null;
             // try the preferred choice first (the narrowed lookups below are currently disabled)
-            if (street!=-1){
-                sub = targetString.substring(street+streetMap.get(street).length());
-                Matcher m = LEVEL_2_SUFFIX_PATTERN.matcher(sub);
-                if (m.find()){
-                    sub=sub.substring(m.end());
-                }
-                results= contain(sub,STREET_TO_COMMUNITY_MAP.get(streetMap.get(street)),targetString.length()-sub.length());
-                completeMatchCommunity=washResult(targetString,results,LEVEL_3_SUFFIX_PATTERN,LEVEL_1_SUFFIX_PATTERN,LEVEL_2_SUFFIX_PATTERN);
-
-            }
-            //一选不存在或匹配无结果,先搜全区
-            if ((results == null || results.isEmpty()) && splitAddress.getDistrict()!=null) {
-                results = contain(sub, DISTRICT_TO_COMMUNITY_MAP.get(splitAddress.getDistrict()),targetString.length()-sub.length());
-                if (completeMatchCommunity==-1)completeMatchCommunity=washResult(targetString,results,LEVEL_3_SUFFIX_PATTERN,LEVEL_1_SUFFIX_PATTERN,LEVEL_2_SUFFIX_PATTERN);
-            }
+//           List<AddressPart> completeStreet = streetParts.stream().filter(AddressPart::isCompleteMatch).collect(Collectors.toList());
+//            if (completeStreet.size() == 1) {
+//                AddressPart completeStreetPart = completeStreet.get(0);
+//                results = contain(targetString, STREET_TO_COMMUNITY_MAP.get(completeStreetPart.address), 0);
+//                washResult(targetString, results, LEVEL_3_SUFFIX_PATTERN, LEVEL_1_SUFFIX_PATTERN, LEVEL_2_SUFFIX_PATTERN);
+//
+//            }
+//            // no preferred choice, or nothing matched: search the whole district first
+//            if ((results == null || results.isEmpty()) && splitAddress.getDistrict() != null) {
+//                results = contain(targetString, DISTRICT_TO_COMMUNITY_MAP.get(splitAddress.getDistrict().replaceAll("新?区", "")), 0);
+//                washResult(targetString, results, LEVEL_3_SUFFIX_PATTERN, LEVEL_1_SUFFIX_PATTERN, LEVEL_2_SUFFIX_PATTERN);
+//            }
             // finally the whole city
-            if (results == null || results.isEmpty()) {
-                results = contain(sub, All_COMMUNITY_IN_SHANGHAI.keySet(),targetString.length()-sub.length());
-                if (completeMatchCommunity==-1)completeMatchCommunity=washResult(targetString,results,LEVEL_3_SUFFIX_PATTERN,LEVEL_1_SUFFIX_PATTERN,LEVEL_2_SUFFIX_PATTERN);
-            }
-            Iterator<Integer> iterator = results.keySet().iterator();
-            while (iterator.hasNext()) {
-                int key = iterator.next();
-                String name = results.get(key);
-                if (key > 0 && name.equals("江镇") && targetString.charAt(key - 1) == '松') {
-                    iterator.remove();
-                }
-                if (key > 0 && name.equals("镇江")) {
-                    String sub1 = targetString.substring(key+2);
-                    if (LEVEL_3_SUFFIX_PATTERN.matcher(sub1).matches()) {
-                        iterator.remove();
+//            if (results == null || results.isEmpty()) {
+            results = contain(targetString, All_COMMUNITY_IN_SHANGHAI.keySet(), 0); // every occurrence of every known community name
+            washResult(targetString, results, LEVEL_3_SUFFIX_PATTERN, LEVEL_1_SUFFIX_PATTERN, LEVEL_2_SUFFIX_PATTERN); // filters results in place; hits followed by LEVEL_3_SUFFIX_PATTERN become confirmed
+//            }
+            results = results.stream().filter(addressPart -> { // hand-written exceptions for names that are substrings of other place names
+                int key = addressPart.location;
+                String name = addressPart.address;
+                switch (name) {
+
+                    case "江镇": {
+                        if (key > 0 && targetString.charAt(key - 1) == '松') {
+                            return false;          // avoid taking "江镇" out of "松江镇"
+                        }
+                        break;
+                    }
+
+                    case "镇江": {
+                        String after = targetString.substring(key + name.length());
+                        return LEVEL_3_SUFFIX_PATTERN.matcher(after).find(); // keep the hit only if the pattern finds a match in the text after it
+                        // avoid extracting "镇江" from strings shaped like "xx镇江x村"
                     }
                 }
-            }
-            communityMap.putAll(results);
+                return true;
+            }).collect(Collectors.toList());
+            communityParts.addAll(results);
             // when exactly one candidate is left, treat it as the confirmed choice
-            if (communityMap.size()==1){
-                int index = (int)communityMap.keySet().toArray()[0];
-                if (street!=index)community=index;
+            if (communityParts.size() == 1) {
+                communityParts.get(0).matchCompete();
             }
 
         }
 
-        void matchThreeLevelAdd(){
-            int handingPoint=0;
-            threeLevelAddress handingTLA=new threeLevelAddress("","","","","","","","","");
-            for (String communityName: new HashSet<>(communityMap.values())){
-                if (communityName.isEmpty())continue;
-                for(threeLevelAddress t:All_COMMUNITY_IN_SHANGHAI.get(communityName)){
+        void matchThreeLevelAdd() { // Picks the table record with the highest checkTLA score among all records reachable from the found names.
+            int handingPoint = 0; // best score so far; a record must score above 0 to be chosen
+            ThreeLevelAddress handingTLA = new ThreeLevelAddress("", "", "", "", "", "", "", "", ""); // all-empty placeholder, kept when nothing scores above 0
+            for (AddressPart addressPart : communityParts) { // candidates reached through community hits are examined first
+                for (ThreeLevelAddress t : All_COMMUNITY_IN_SHANGHAI.get(addressPart.address)) {
                     int point = checkTLA(t);
-                    if (point>handingPoint){
-                        handingPoint=point;
-                        handingTLA=t;
+                    if (point > handingPoint) { // strict comparison: on a tie the earlier candidate stays
+                        handingPoint = point;
+                        handingTLA = t;
                     }
                 }
             }
-            for (String streetName:new HashSet<>(streetMap.values())){
-                if (streetName.isEmpty())continue;
-                for(threeLevelAddress t:All_STREET_IN_SHANGHAI.get(streetName)){
+            for (AddressPart addressPart : streetParts) { // then candidates reached through street hits
+                for (ThreeLevelAddress t : All_STREET_IN_SHANGHAI.get(addressPart.address)) {
                     int point = checkTLA(t);
-                    if (point>handingPoint){
-                        handingPoint=point;
-                        handingTLA=t;
+                    if (point > handingPoint) {
+                        handingPoint = point;
+                        handingTLA = t;
                     }
                 }
             }
             threeLevelAddress = handingTLA; // publish the winner (or the empty placeholder) on this instance
         }
-        int checkTLA(threeLevelAddress t){
-            int output=0;
-            if (t.district.equals(splitAddress.getDistrict()))output+=1;
-            if (streetMap.containsValue(t.street))output+=10;
-            if (street!=-1&&streetMap.get(street).equals(t.street))output+=1000;
-            if (communityMap.containsValue(t.community))output+=100;
-            if (community!=-1&&communityMap.get(community).equals(t.community))output+=1000;
-            if (community!=-1&&Pattern.matches(".*\\d$",communityMap.get(community)))output-=1000;
+
+        /**
+         * Scores how well a candidate record agrees with the names found in the address; higher is better.
+         * Weights: district equal +1; street among the hits +10, and +1000 more when that hit is confirmed;
+         * community among the hits +100, and +1000 more when that hit is confirmed and its name does not end in a digit.
+         *
+         * @param t candidate record taken from the lookup tables
+         * @return the accumulated score, 0 when nothing agrees
+         */
+        int checkTLA(ThreeLevelAddress t) {
+            int output = 0;
+            // getDistrict() is null-checked by the disabled lookup code in this class, so it may be null here:
+            // guard before stripping the "区"/"新区" suffix instead of throwing a NullPointerException.
+            String district = splitAddress.getDistrict();
+            if (district != null && t.district.equals(district.replaceAll("新?区", ""))) output += 1;
+
+            if (streetParts.stream().anyMatch(e -> e.address.equals(t.street))) output += 10;
+            if (streetParts.stream().filter(AddressPart::isCompleteMatch).anyMatch(e -> e.address.equals(t.street)))
+                output += 1000;
+
+            if (communityParts.stream().anyMatch(e -> e.address.equals(t.community))) output += 100;
+            if (communityParts.stream().filter(AddressPart::isCompleteMatch).filter(e -> !e.address.matches(".*\\d$")).anyMatch(e -> e.address.equals(t.community)))
+                output += 1000;
             return output;
         }
-        void guessFirstMatch(){
-            //先街道
-            if (!streetMap.isEmpty()&&street==-1) {
-                for (int i :streetMap.keySet()){
-                    if (streetMap.get(i).equals(threeLevelAddress.street)&&(i<street||street==-1)) {
-                        street=i;
-                    }
-                }
-            }
-            //再居委
-            if (community==-1&& !communityMap.isEmpty()){
-                for (int i :communityMap.keySet()){
-                    if (communityMap.get(i).equals(threeLevelAddress.community)&&street!=i&&(i<community||community==-1)){
-                        community=i;
-                    }
-                }
 
-            }
+        void guessFirstMatch() { // Chooses which street hit and which community hit represent the selected threeLevelAddress.
+            // street
+            bestStreet = streetParts.stream()
+                    .filter(p -> p.completeMatch) // confirmed hits are preferred
+                    .filter(p -> p.address.equals(threeLevelAddress.street))
+                    .max(Comparator.comparingInt(p -> p.location)) // take the one with the largest location
+                    .orElseGet(()->streetParts.stream().filter(p -> p.address.equals(threeLevelAddress.street)) // fallback: any hit with that name, confirmed or not
+                            .max(Comparator.comparingInt(p -> p.location)).orElse(null)); // null when the street name was not found at all
+            // community
+            bestCommunity = communityParts.stream()
+                    .filter(p -> p.completeMatch) // confirmed hits are preferred
+                    .filter(p -> p.address.equals(threeLevelAddress.community))
+                    .max(Comparator.comparingInt(p -> p.location)) // again the hit with the largest location
+                    .orElseGet(()->communityParts.stream().filter(p -> p.address.equals(threeLevelAddress.community)) // fallback: any hit with that name
+                            .max(Comparator.comparingInt(p -> p.location)).orElse(null)); // null when the community name was not found at all
 
         }
     }
-    static int washResult(String sourceAddress, Map<Integer, String> result, Pattern should, Pattern... never){
-        Map<Integer,String> output=new HashMap<>();
-        int outputInt = -1;
-        for (int index : result.keySet()) {
-            String name = result.get(index);
-            String sub =sourceAddress.substring(index + name.length());
+
+    static List<AddressPart> washResult(String sourceAddress, List<AddressPart> result, Pattern should, Pattern... never) { // Filters name hits by the text that follows each one; result is rewritten in place and a separate list with the same survivors is returned.
+        List<AddressPart> output = new ArrayList<>();
+        for (AddressPart part : result) {
+            int index = part.location;
+            String name = part.address;
+            String sub = sourceAddress.substring(index + name.length()); // the text right after the hit
             // followed by the expected suffix: keep unconditionally and flag as confirmed
             if (should.matcher(sub).find()) {
-                outputInt=index;
+                part.matchCompete();
             } else {
                 // drop hits that are really road names, e.g. "南京路" or "北京大道"
                 if (ROAD_SUFFIX_PATTERN.matcher(sub).find()) {
                     continue;
                 }
-                boolean skip =false;
-                for (Pattern p :never){
-                    if (p.matcher(sub).find())skip=true;
+                boolean skip = false;
+                for (Pattern p : never) { // a match of any "never" pattern after the hit disqualifies it
+                    if (p.matcher(sub).find()) skip = true;
                 }
                 if (skip) continue;
             }
-            output.put(index,name);
+            output.add(part);
         }
         result.clear(); // callers rely on this in-place update and ignore the return value
-        result.putAll(output);
-        return outputInt;
+        result.addAll(output);
+        return output;
     }
 
     /**
-     * 检查字符串含有哪些字符,输出这些匹配字符的位置和字符的map
-     * @param s 被检查字符串
-     * @param nameList 检查范围
+     * Finds every occurrence (overlapping ones included) of each candidate name in a string.
+     *
+     * @param s        the string to scan
+     * @param nameList candidate names; {@code null} yields an empty list and empty names are skipped
+     * @param offset   added to every reported position, for callers that scan a substring but need
+     *                 positions relative to the full string
+     * @return one {@link AddressPart} per occurrence, grouped by name in the iteration order of {@code nameList}
      */
-    private static Map<Integer,String> contain(String s,Iterable<String> nameList,int offset){
-        Map<Integer,String> output = new HashMap<>();
-        if (nameList==null){
+    private static List<AddressPart> contain(String s, Iterable<String> nameList, int offset) {
+        List<AddressPart> output = new ArrayList<>();
+        if (nameList == null) {
             return output;
         }
-        for (String name:nameList){
-            if (name.isEmpty())continue;
+        for (String name : nameList) {
+            if (name.isEmpty()) continue;
             int index = -1;
-            while ((index = s.indexOf(name, index + 1)) != -1){
-                output.put(index+offset,name);
+            while ((index = s.indexOf(name, index + 1)) != -1) {
+                // honour offset, as the replaced Map-based version did (it was silently ignored here)
+                output.add(new AddressPart(name, index + offset));
             }
         }
         return output;
     }
-    private static SplitAddress split(String sourceAddress){
+
+    private static SplitAddress split(String sourceAddress) {
         //事前准备
-        String beautyAddress = sourceAddress.replaceAll("[\\s]+","");
+        String beautyAddress = sourceAddress.replaceAll("[\\s]+", "");
 
         SplitAddress splitAddress = new SplitAddress();
         splitAddress.setFullAddress(sourceAddress);
 
 
         splittingAddress splittingAddress = new splittingAddress();
-        splittingAddress.splitAddress=splitAddress;
+        splittingAddress.splitAddress = splitAddress;
 
 
         String[] result = AddressSplitUtil.splitAddress(beautyAddress);
@@ -313,23 +373,27 @@ public class ShanghaiAddressSplitUtil {
         splitAddress.setCity(result[1]);
         splitAddress.setDistrict(result[2]);
         //检查是否在外省,未找到省市或者在省市中找到上海,或者找到上海的区都算作省内
-        Map<Integer, String> districtContainResult = contain(beautyAddress, DISTRICT_TO_COMMUNITY_MAP.keySet(), 0);
-        int  disIndex= washResult(beautyAddress, districtContainResult,LEVEL_1_SUFFIX_PATTERN);
-        Map<Integer, String> streetContainResult = contain(beautyAddress, STREET_TO_COMMUNITY_MAP.keySet(), 0);
-        int  streetIndex= washResult(beautyAddress, districtContainResult,LEVEL_2_SUFFIX_PATTERN);
+        List<AddressPart> districtContainResult = contain(beautyAddress, DISTRICT_TO_COMMUNITY_MAP.keySet(), 0);
+        washResult(beautyAddress, districtContainResult, LEVEL_1_SUFFIX_PATTERN);
+        List<AddressPart> streetContainResult = contain(beautyAddress, STREET_TO_COMMUNITY_MAP.keySet(), 0);
+        washResult(beautyAddress, districtContainResult, LEVEL_2_SUFFIX_PATTERN);
         splitAddress.setAddr(result[3]);
-        if (!((result[0].isEmpty()|| result[0].equals("上海市")) && (result[1].isEmpty()  || result[1].equals("上海市"))||
-                !districtContainResult.isEmpty()||!streetContainResult.isEmpty())) {
+        if (!((result[0].isEmpty() || result[0].equals("上海市")) && (result[1].isEmpty() || result[1].equals("上海市")) ||
+                !districtContainResult.isEmpty() || !streetContainResult.isEmpty())) {
             splitAddress.setStatus(2);
             return splitAddress;
         }
-        if (!districtContainResult.isEmpty()){
-            if (disIndex!=-1){
-                String district=districtContainResult.get(disIndex);
-                threeLevelAddress disTLA = All_COMMUNITY_IN_SHANGHAI.get(DISTRICT_TO_COMMUNITY_MAP.get(district).get(0)).get(0);
-                splitAddress.setDistrict(disTLA.districtFullName);
-                splitAddress.setDistrictCode(disTLA.districtCode);
-            }
+        AddressPart bestDistrict = null;
+        if (!districtContainResult.isEmpty()) {
+            bestDistrict = districtContainResult.stream()
+                    .filter(p -> p.completeMatch)
+                    .min(Comparator.comparingInt(p -> p.location))
+                    .orElseGet(() -> districtContainResult.stream()
+                            .min(Comparator.comparingInt(p -> p.location)).get());
+            String district = bestDistrict.address;
+            ThreeLevelAddress disTLA = All_COMMUNITY_IN_SHANGHAI.get(DISTRICT_TO_COMMUNITY_MAP.get(district).get(0)).get(0);
+            splitAddress.setDistrict(disTLA.districtFullName);
+            splitAddress.setDistrictCode(disTLA.districtCode);
         }
         splitAddress.setProvince("上海市");
         splitAddress.setCity("上海市");
@@ -342,14 +406,15 @@ public class ShanghaiAddressSplitUtil {
         splittingAddress.guessFirstMatch();
 
 
-        if (splittingAddress.street!=-1||splittingAddress.community!=-1){
-            splitAddress.setStreet(splittingAddress.threeLevelAddress.streetFullName);
-            splitAddress.setStreetCode(splittingAddress.threeLevelAddress.streetCode);
+        if (splittingAddress.bestStreet != null || splittingAddress.bestCommunity != null) {
             splitAddress.setDistrict(splittingAddress.threeLevelAddress.districtFullName);
             splitAddress.setDistrictCode(splittingAddress.threeLevelAddress.districtCode);
-
         }
-        if (splittingAddress.community!=-1){
+        if (splittingAddress.bestStreet != null) {
+            splitAddress.setStreet(splittingAddress.threeLevelAddress.streetFullName);
+            splitAddress.setStreetCode(splittingAddress.threeLevelAddress.streetCode);
+        }
+        if (splittingAddress.bestCommunity != null) {
             splitAddress.setCommunity(splittingAddress.threeLevelAddress.communityFullName);
             splitAddress.setCommunityCode(splittingAddress.threeLevelAddress.communityCode);
 
@@ -357,111 +422,197 @@ public class ShanghaiAddressSplitUtil {
 
 
         //检查是否能够分离
-        if(splittingAddress.community==-1&&splittingAddress.street==-1){
+        if (splittingAddress.bestCommunity == null && splittingAddress.bestStreet == null) {
             //检查是否是非地址
-            if (UN_ADDRESS_PATTERN.matcher(splitAddress.getFullAddress()).find()){
+            if (UN_ADDRESS_PATTERN.matcher(splitAddress.getFullAddress()).find()) {
                 splitAddress.setStatus(3);
                 return splitAddress;
             }
 
-            if (disIndex!=-1){
-                String sub = beautyAddress.substring(disIndex+districtContainResult.get(disIndex).length());
+            if (bestDistrict != null) {
+                String sub = beautyAddress.substring(bestDistrict.location + bestDistrict.address.length());
                 Matcher m = LEVEL_1_SUFFIX_PATTERN.matcher(sub);
-                if (m.find()){
+                if (m.find()) {
                     sub = sub.substring(m.end());
                 }
                 splitAddress.setAddr(sub);
             }
 
             splitAddress.setStatus(1);
-            if (result[0].isEmpty()&&result[1].isEmpty()&&districtContainResult.isEmpty())splitAddress.setStatus(4);
+            if (result[0].isEmpty() && result[1].isEmpty() && districtContainResult.isEmpty())
+                splitAddress.setStatus(4);
             return splitAddress;
-        }else if (splittingAddress.street> splittingAddress.community){
-            String sub = beautyAddress.substring(splittingAddress.street+splittingAddress.streetMap.get(splittingAddress.street).length());
-            Matcher m = LEVEL_2_SUFFIX_PATTERN.matcher(sub);
-            if (m.find()){
-                sub = sub.substring(m.end());
-            }
-            splitAddress.setAddr(sub);
-        }else {
-            String sub = beautyAddress.substring(
-                    splittingAddress.community+
-                            splittingAddress.communityMap.
-                                    get(splittingAddress.community).length());
-            Matcher m = LEVEL_3_SUFFIX_PATTERN.matcher(sub);
-            if (m.find()){
-                sub = sub.substring(m.end());
+        } else {
+            AddressPart bestStreet = splittingAddress.bestStreet;
+            AddressPart bestCommunity = splittingAddress.bestCommunity;
+            AddressPart lastPart = Stream.of(bestDistrict, bestStreet, bestCommunity)
+                    .filter(Objects::nonNull)
+                    .max(Comparator.comparingInt(p -> p.location))
+                    .orElse(null);
+            String tail = "";
+            if (lastPart != null) {
+                int end = lastPart.location + lastPart.address.length();
+                tail = beautyAddress.substring(end);
+
+                Pattern suffixPat =
+                        lastPart == bestDistrict ? LEVEL_1_SUFFIX_PATTERN :
+                                lastPart == bestStreet ? LEVEL_2_SUFFIX_PATTERN :
+                                        LEVEL_3_SUFFIX_PATTERN;
+
+                Matcher m = suffixPat.matcher(tail);
+                if (m.find()) {
+                    tail = tail.substring(m.end());
+                }
+            } else {
+                tail = beautyAddress;
             }
-            splitAddress.setAddr(sub);
+            splitAddress.setAddr(tail.trim());
         }
 
         splitAddress.setStatus(0);
-        if (result[0].isEmpty()&&result[1].isEmpty()&&districtContainResult.isEmpty())splitAddress.setStatus(4);
-        if (splitAddress.getStreet().equals("自由贸易试验区"))splitAddress.setStatus(0);
+        if (result[0].isEmpty() && result[1].isEmpty() && districtContainResult.isEmpty()) splitAddress.setStatus(4);
+        if (splitAddress.getStreet().equals("自由贸易试验区")) splitAddress.setStatus(0);
         return splitAddress;
     }
 
-    private static SplitAddress beautyResult(SplitAddress splitAddress){
+    private static SplitAddress beautyResult(SplitAddress splitAddress) { // Post-processes a split result in place (repairs an empty/over-split addr, strips MULTI_ADDRESS matches) and returns the same object
         // Check for over-splitting: when addr is empty or OVER_SPLIT matches it, prepend the finest non-empty level (community, else street, else district, else the city name)
-        if (splitAddress.getAddr().isEmpty() ||OVER_SPLIT.matcher(splitAddress.getAddr()).find()){
-            if (splitAddress.getCommunity().isEmpty()){
-                if (splitAddress.getStreet().isEmpty()){
-                    if (splitAddress.getDistrict().isEmpty()){
-                        splitAddress.setAddr("上海市"+splitAddress.getAddr());
-                    }else {
-                        splitAddress.setAddr(splitAddress.getDistrict()+splitAddress.getAddr());
+        if (splitAddress.getAddr().isEmpty() || OVER_SPLIT.matcher(splitAddress.getAddr()).find()) {
+            if (splitAddress.getCommunity().isEmpty()) {
+                if (splitAddress.getStreet().isEmpty()) {
+                    if (splitAddress.getDistrict().isEmpty()) {
+                        splitAddress.setAddr("上海市" + splitAddress.getAddr()); // no district/street/community available: fall back to the city name
+                    } else {
+                        splitAddress.setAddr(splitAddress.getDistrict() + splitAddress.getAddr()); // only the district is known
                     }
-                }else {
-                    splitAddress.setAddr(splitAddress.getStreet()+splitAddress.getAddr());
+                } else {
+                    splitAddress.setAddr(splitAddress.getStreet() + splitAddress.getAddr()); // street is known but community is not
                 }
-            }else {
-                splitAddress.setAddr(splitAddress.getCommunity()+splitAddress.getAddr());
+            } else {
+                splitAddress.setAddr(splitAddress.getCommunity() + splitAddress.getAddr()); // community is the finest level and wins when present
             }
         }
         // Handle multiple house numbers / multiple lanes: remove every match of MULTI_ADDRESS from addr
-        splitAddress.setAddr(splitAddress.getAddr().replaceAll(String.valueOf(MULTI_ADDRESS),""));
+        splitAddress.setAddr(splitAddress.getAddr().replaceAll(String.valueOf(MULTI_ADDRESS), "")); // NOTE(review): MULTI_ADDRESS is declared outside this chunk; its string form is used as the regex here — confirm its type
 
         return splitAddress;
     }
+
     /**
      * Tool entry point that returns every split result.
+     *
+     * <p>The text outside top-level brackets is joined and split as one address (first result);
+     * each top-level bracketed segment is then split recursively and its results are appended.
+     * Every returned element has its source address set to {@code sourceAddress}.
+     *
      * @param sourceAddress an address in any form; note that addresses outside Shanghai are only split down to county level, while addresses inside Shanghai are split down to the neighborhood committee
+     * @return the result for the outer text followed by the results of each bracketed segment
      */
-    public static List<SplitAddress> splitAddresses(String sourceAddress){
-        Matcher matcher = Pattern.compile("\\(([^()]*|\\([^()]*\\))*\\)|\\[([^\\[\\]]*|\\[[^\\[\\]]*])*]|(([^()]*|([^()]*))*)").matcher(sourceAddress);
-        List<SplitAddress> addressList =new ArrayList<>();
-        String beautyString = sourceAddress.replaceAll("\\(([^()]*|\\([^()]*\\))*\\)|\\[([^\\[\\]]*|\\[[^\\[\\]]*])*]|(([^()]*|([^()]*))*)","");
-        StringBuilder sb = new StringBuilder();
-        for (char c : beautyString.toCharArray()) {
+    public static List<SplitAddress> splitAddresses(String sourceAddress) {
+        List<SplitAddress> addressList = new ArrayList<>();
+        char[] leftParen = {'(', '{', '[', '('}; // NOTE(review): '(' is listed twice as rendered here; the last entry was presumably a full-width paren (U+FF08) — confirm
+        char[] rightParen = {')', '}', ']', ')'}; // NOTE(review): ')' is listed twice as rendered here; the last entry was presumably a full-width paren (U+FF09) — confirm
+        SplitParenString sp = splitByTopLevelParen(sourceAddress, leftParen, rightParen); // separate text outside brackets from the contents of top-level brackets
+        StringBuilder stringOutOfParen = new StringBuilder();
+        String outParen = String.join("", sp.outOfParen); // concatenate all outside-bracket segments into one string
+        for (char c : outParen.toCharArray()) {
             // Check whether this is a full-width digit
             if (c >= '0' && c <= '9') { // NOTE(review): as rendered, both bounds are ASCII digits, so the conversion below copies the digit unchanged; the literals were presumably full-width digits (U+FF10..U+FF19) — confirm
                 // Convert to a half-width digit
-                sb.append((char) (c - '0' + '0'));
-            } else if (c=='\uE5CE'){
+                stringOutOfParen.append((char) (c - '0' + '0'));
+            } else if (c == '\uE5CE') {
                 // Odd garbled character (U+E5CE); skip it
-            }else {
+            } else {
                 // Keep the original character
-                sb.append(c);
+                stringOutOfParen.append(c);
             }
         }
-        beautyString = sb.toString();
-        addressList.add(beautyResult(split(beautyString)));
-        while (matcher.find()){
-            String address=matcher.group();
-            if (address.length()<=2)continue;
-            addressList.addAll(splitAddresses(address.substring(1,address.length()-1)));
+        outParen = stringOutOfParen.toString();
+        addressList.add(beautyResult(split(outParen))); // first result: the text outside brackets
+        for (String s : sp.inParen) {
+            addressList.addAll(splitAddresses(s)); // recurse into each top-level bracketed segment
         }
-        for (SplitAddress s :addressList)s.setSourceAddress(sourceAddress);
+        for (SplitAddress s : addressList) s.setSourceAddress(sourceAddress); // stamp every result, including those from recursive calls, with this call's input
         return addressList;
     }
 
+    public static class SplitParenString { // Result holder filled by splitByTopLevelParen
+        List<String> outOfParen; // non-empty text segments found at bracket depth 0, in order of appearance
+        List<String> inParen; // non-empty contents of each top-level bracketed segment, without the outer bracket characters
+    }
+
+    public static SplitParenString splitByTopLevelParen(String s, char[] left, char[] right) { // Splits s into text outside brackets and the contents of top-level brackets; any char in left opens and any char in right closes (bracket kinds are not paired with each other)
+        Set<Character> leftSet = new HashSet<>();
+        for (char c : left) {
+            leftSet.add(c);
+        }
+        Set<Character> rightSet = new HashSet<>();
+        for (char c : right) {
+            rightSet.add(c);
+        }
+        SplitParenString sp = new SplitParenString();
+        sp.outOfParen = new ArrayList<>();
+        sp.inParen = new ArrayList<>();
+        StringBuilder depth0 = new StringBuilder(); // accumulates the current run of text outside any bracket
+        StringBuilder depth1 = new StringBuilder(); // accumulates the content of the current top-level bracket, nested brackets included
+        int depth = 0; // current bracket nesting level
+        for (int i = 0; i < s.length(); i++) {
+            char c = s.charAt(i);
+            boolean isParen = false;
+            if (leftSet.contains(c)) {
+                isParen = true;
+                if (depth > 0) {
+                    depth1.append(c); // a nested opening bracket stays part of the inner text
+                }
+                depth++;
+                if (depth == 1) { // just entered a top-level bracket: flush the pending outside text
+                    String depth0Str = depth0.toString();
+                    if (!depth0Str.isEmpty()) {
+                        sp.outOfParen.add(depth0Str);
+                        depth0 = new StringBuilder();
+                    }
+                }
+            } else if (rightSet.contains(c)) {
+                isParen = true;
+                if (depth > 1) {
+                    depth1.append(c); // a nested closing bracket stays part of the inner text
+                }
+                depth--;
+                if (depth == 0) { // the top-level bracket just closed: emit its content (empty brackets emit nothing)
+                    String depth1Str = depth1.toString();
+                    if (!depth1Str.isEmpty()) {
+                        sp.inParen.add(depth1Str);
+                        depth1 = new StringBuilder();
+                    }
+                }
+                if (depth < 0) { // unmatched closing bracket at depth 0: drop the character and stay at depth 0
+                    depth = 0;
+                }
+            }
+            if (!isParen) {
+                if (depth == 0) {
+                    depth0.append(c);
+                } else if (depth >= 1) {
+                    depth1.append(c);
+                }
+            }
+        }
+        String depth0Str = depth0.toString(); // flush trailing text outside brackets
+        if (!depth0Str.isEmpty()) {
+            sp.outOfParen.add(depth0Str);
+        }
+        String depth1Str = depth1.toString(); // content of a bracket that was never closed is still reported as inner text
+        if (!depth1Str.isEmpty()) {
+            sp.inParen.add(depth1Str);
+        }
+        return sp;
+    }
+
+
     /**
      * Tool entry point that returns only the best result.
+     *
      * @param sourceAddress an address in any form; note that addresses outside Shanghai are only split down to county level, while addresses inside Shanghai are split down to the neighborhood committee
+     * @return the greatest element of {@code splitAddresses(sourceAddress)} according to {@code SplitAddress::compareTo}, or a newly constructed {@code SplitAddress} if that list is empty
      */
-    public static SplitAddress splitBestAddress(String sourceAddress){
+    public static SplitAddress splitBestAddress(String sourceAddress) {
         return splitAddresses(sourceAddress).stream().max(SplitAddress::compareTo).orElse(new SplitAddress());
     }
+
     public static void main(String[] args) throws Exception { // Manual smoke test: prints the split results for a few sample Shanghai addresses
 //        List<SplitAddress> result = new ArrayList<>();
 //        for (Map<String,Object> row:ExcelReaderUtils.readExcel("C:\\Users\\dxh\\IdeaProjects\\address_poi_yysz_server\\src\\main\\resources\\yysk_dmdz_address_standardization_200000_36.xlsx")){
@@ -471,12 +622,11 @@ public class ShanghaiAddressSplitUtil {
 //        ExcelReaderUtils.writeSplitAddressExcel(result,"C:\\\\Users\\\\dxh\\\\IdeaProjects\\\\address_poi_yysz_server\\\\src\\\\main\\\\resources\\\\result.xlsx");
 //        System.out.println("完成");
         new ShanghaiAddressSplitUtil().init(); // NOTE(review): init() is not visible in this chunk; presumably it loads the lookup tables the static split methods read — confirm
-        System.out.println(splitBestAddress("新胜路88、98号3号厂房"));
-        System.out.println(splitBestAddress("新胜路88-98号3号厂房"));
-        System.out.println(splitBestAddress("新胜路、98号3号厂房"));
-        System.out.println(splitBestAddress("新胜路88\\98号3号厂房"));
-        System.out.println(splitBestAddress("新胜路18、28号3号厂房"));
-        System.out.println(splitBestAddress("新胜路28号3号厂房"));
-        System.out.println(splitBestAddress("88、98号3号厂房"));
+        splitAddresses("上海市浦东新区保税区外高桥国际金融中心").forEach(System.out::println); // each sample prints every result, not only the best one
+        splitAddresses("上海市闵行区莘庄工业区上海汉洁科学仪器有限公司").forEach(System.out::println);
+        splitAddresses("上海市虹口区北外滩街道DK").forEach(System.out::println);
+        splitAddresses("上海市虹口区凉城新村街道好邻居(凉城路店)").forEach(System.out::println);
+        splitAddresses("上海市闵行区虹桥镇上海感图网络科技有限公司").forEach(System.out::println);
+        splitAddresses("上海市长宁区仙霞新村街道仙霞街道外来人员管理办公室").forEach(System.out::println);
     }
 }