Преглед на файлове

修复一个正则表达式的灾难性回溯问题

ximinghao преди 1 седмица
родител
ревизия
79417228fe
променени са 1 файла, в които са добавени 89 реда и са изтрити 17 реда
  1. 89 17
      src/main/java/com/skyversation/poiaddr/util/address_spliter/ShanghaiAddressSplitUtil.java

+ 89 - 17
src/main/java/com/skyversation/poiaddr/util/address_spliter/ShanghaiAddressSplitUtil.java

@@ -427,34 +427,106 @@ public class ShanghaiAddressSplitUtil {
      * 工具入口,返回所有数据
      * @param sourceAddress 任意形式的地址,请注意,上海市外的地址仅分词到县,上海市内分词到居委
      */
-    public static List<SplitAddress> splitAddresses(String sourceAddress){
-        Matcher matcher = Pattern.compile("\\(([^()]*|\\([^()]*\\))*\\)|\\[([^\\[\\]]*|\\[[^\\[\\]]*])*]|（([^（）]*|（[^（）]*）)*）").matcher(sourceAddress);
-        List<SplitAddress> addressList =new ArrayList<>();
-        String beautyString = sourceAddress.replaceAll("\\(([^()]*|\\([^()]*\\))*\\)|\\[([^\\[\\]]*|\\[[^\\[\\]]*])*]|（([^（）]*|（[^（）]*）)*）","");
-        StringBuilder sb = new StringBuilder();
-        for (char c : beautyString.toCharArray()) {
+    public static List<SplitAddress> splitAddresses(String sourceAddress) {
+        List<SplitAddress> addressList = new ArrayList<>();
+        char[] leftParen = {'(', '{', '[', '（'};
+        char[] rightParen = {')', '}', ']', '）'};
+        SplitParenString sp = splitByTopLevelParen(sourceAddress, leftParen, rightParen);
+        StringBuilder stringOutOfParen = new StringBuilder();
+        String outParen = String.join("", sp.outOfParen);
+        for (char c : outParen.toCharArray()) {
             // 检查是否为全角数字
             if (c >= '０' && c <= '９') {
                 // 转换为半角数字
-                sb.append((char) (c - '０' + '0'));
-            } else if (c=='\uE5CE'){
+                stringOutOfParen.append((char) (c - '０' + '0'));
+            } else if (c == '\uE5CE') {
                 // 奇妙的乱码,跳过
-            }else {
+            } else {
                 // 保持原字符
-                sb.append(c);
+                stringOutOfParen.append(c);
             }
         }
-        beautyString = sb.toString();
-        addressList.add(beautyResult(split(beautyString)));
-        while (matcher.find()){
-            String address=matcher.group();
-            if (address.length()<=2)continue;
-            addressList.addAll(splitAddresses(address.substring(1,address.length()-1)));
+        outParen = stringOutOfParen.toString();
+        addressList.add(beautyResult(split(outParen)));
+        for (String s:sp.inParen){
+            addressList.addAll(splitAddresses(s));
         }
-        for (SplitAddress s :addressList)s.setSourceAddress(sourceAddress);
+        for (SplitAddress s : addressList) s.setSourceAddress(sourceAddress);
         return addressList;
     }
 
+    public static class SplitParenString{
+        List<String> outOfParen;
+        List<String> inParen;
+    }
+
+    /**
+     * Separates a string into the text lying outside all brackets and the text enclosed by
+     * each outermost bracket pair.
+     * <p>
+     * Any character in {@code left} opens a nesting level and any character in {@code right}
+     * closes one; the kind of bracket is not matched between opener and closer. The outermost
+     * brackets themselves are dropped, nested brackets are kept as part of the enclosed text,
+     * a closing bracket with nothing open is discarded, and text following an unclosed opening
+     * bracket is still reported as enclosed. Empty segments are never added to either list.
+     *
+     * @param s     the text to split
+     * @param left  characters treated as opening brackets
+     * @param right characters treated as closing brackets
+     * @return the outside segments and the enclosed segments, each in input order
+     */
+    public static SplitParenString splitByTopLevelParen(String s, char[] left, char[] right) {
+        Set<Character> openers = new HashSet<>();
+        for (char opener : left) {
+            openers.add(opener);
+        }
+        Set<Character> closers = new HashSet<>();
+        for (char closer : right) {
+            closers.add(closer);
+        }
+
+        SplitParenString result = new SplitParenString();
+        result.outOfParen = new ArrayList<>();
+        result.inParen = new ArrayList<>();
+
+        StringBuilder outside = new StringBuilder();
+        StringBuilder enclosed = new StringBuilder();
+        int nesting = 0;
+        for (char ch : s.toCharArray()) {
+            if (openers.contains(ch)) {
+                if (nesting == 0) {
+                    // Entering a top-level group: close off the pending outside segment.
+                    if (outside.length() > 0) {
+                        result.outOfParen.add(outside.toString());
+                        outside.setLength(0);
+                    }
+                } else {
+                    // A nested opener is part of the enclosed text.
+                    enclosed.append(ch);
+                }
+                nesting++;
+            } else if (closers.contains(ch)) {
+                if (nesting > 1) {
+                    // A nested closer is part of the enclosed text.
+                    enclosed.append(ch);
+                    nesting--;
+                } else if (nesting == 1) {
+                    // Leaving a top-level group: emit what it contained, if anything.
+                    nesting = 0;
+                    if (enclosed.length() > 0) {
+                        result.inParen.add(enclosed.toString());
+                        enclosed.setLength(0);
+                    }
+                }
+                // nesting == 0: stray closer with nothing open, dropped.
+            } else if (nesting == 0) {
+                outside.append(ch);
+            } else {
+                enclosed.append(ch);
+            }
+        }
+
+        // Emit whatever is still buffered, including text after an unclosed opener.
+        if (outside.length() > 0) {
+            result.outOfParen.add(outside.toString());
+        }
+        if (enclosed.length() > 0) {
+            result.inParen.add(enclosed.toString());
+        }
+        return result;
+    }
+
     /**
      * 工具入口,仅返回最优
      * @param sourceAddress 任意形式的地址,请注意,上海市外的地址仅分词到县,上海市内分词到居委