package com.skyversation.poiaddr.util; import java.io.InputStream; import java.util.*; import java.util.regex.Matcher; import java.util.regex.Pattern; import java.util.stream.Collectors; public class AddressSplitUtil { /**省级行政单位简称->全称映射表 * 只读,请勿修改 * */ public static final Map LEVEL_1_NAME_MAP; /**市级行政单位简称->全称映射表 * 只读,请勿修改 * */ public static final Map LEVEL_2_NAME_MAP; /**县级行政单位简称->全称映射表 * 只读,请勿修改 * */ public static final Map LEVEL_3_NAME_MAP; /**省市县三级行政单位简称树 * 只读,请勿修改 * */ public static final Map>> SIMPLE_NAME_TREE; private static final Map> All_CITY_IN_TREE; private static final Pattern LEVEL_1_SUFFIX_PATTERN = Pattern.compile("^(?:维吾尔|((?:(?!省|市|自治区).)*?族))?(?:省|市|自治区)"); private static final Pattern LEVEL_2_SUFFIX_PATTERN = Pattern.compile("^(?:市|自治州|地区|盟)"); private static final Pattern LEVEL_3_SUFFIX_PATTERN = Pattern.compile("^(?:县|自治县|市|区|旗|自治旗|林区|特区)"); public static final Pattern ROAD_SUFFIX_PATTERN = Pattern.compile("^(?:旅游区|[东南西北中一二三四五六七八九十公大小支新老]{0,2}(?:大街|路|大道|街|菜市场|马路|村))"); static { Map level1NameMap = new HashMap<>(); Map level2NameMap = new HashMap<>(); Map level3NameMap = new HashMap<>(); Map>> simpleNameTree = new HashMap<>(); String file = "全国省市县记录.xlsx"; InputStream is = AddressSplitUtil.class.getResourceAsStream(file); if (is==null) is= AddressSplitUtil.class.getResourceAsStream("/"+file); if (is==null) throw new RuntimeException("无法找到"+file); try { List> list = ExcelReaderUtils.readExcel(is); for (Map row : list) { Object level1Name = row.get("省份"); Object level1SimpleName = row.get("省份简称"); Object level2Name = row.get("地级市"); Object level2SimpleName = row.get("地级市简称"); Object level3Name = row.get("县级市"); Object level3SimpleName = row.get("县级市简称"); if (level1SimpleName != null && level1Name!=null) { level1NameMap.put(level1SimpleName.toString(), level1Name.toString()); } if (level2SimpleName != null && level2Name!=null) { level2NameMap.put(level2SimpleName.toString(), level2Name.toString()); } if (level3SimpleName != null && level3Name!=null) { level3NameMap.put(level3SimpleName.toString(), level3Name.toString()); } if (level1SimpleName!=null){ if (!simpleNameTree.containsKey(level1SimpleName.toString())){ simpleNameTree.put(level1SimpleName.toString(),new HashMap<>()); } Map> level2Map = simpleNameTree.get(level1SimpleName.toString()); if (level2SimpleName!=null){ if(!level2Map.containsKey(level2SimpleName.toString())){ level2Map.put(level2SimpleName.toString(),new HashSet<>()); } if (level3SimpleName!=null){ level2Map.get(level2SimpleName.toString()).add(level3SimpleName.toString()); } } } } LEVEL_1_NAME_MAP = Collections.unmodifiableMap(level1NameMap); LEVEL_2_NAME_MAP = Collections.unmodifiableMap(level2NameMap); LEVEL_3_NAME_MAP = Collections.unmodifiableMap(level3NameMap); Map>> simpleNameTree_= new HashMap<>(); for (String key : simpleNameTree.keySet()){ simpleNameTree_.put(key , Collections.unmodifiableMap(simpleNameTree.get(key))); } SIMPLE_NAME_TREE= Collections.unmodifiableMap(simpleNameTree_); All_CITY_IN_TREE= Collections.unmodifiableMap(SIMPLE_NAME_TREE.values().stream() .flatMap(map -> map.entrySet().stream()) .collect(Collectors.toMap( Map.Entry::getKey, Map.Entry::getValue, (oldValue, newValue) -> newValue )) ); } catch (Exception e) { throw new RuntimeException(e); } } private static class SplittingAddress { int province = -1; int city = -1; int county = -1; Map provinceInChoose = new HashMap<>(); Map cityInChoose = new HashMap<>(); Map countyInChoose = new HashMap<>(); String sourceAddress ; SplittingAddress(String sourceAddress){ this.sourceAddress=sourceAddress; } String[] toStringList(){ String[] output = new String[4]; output[0]=((province!=-1)?LEVEL_1_NAME_MAP.get(provinceInChoose.get(province)):""); output[1]=((city!=-1)?LEVEL_2_NAME_MAP.get(cityInChoose.get(city)):""); output[2]=((county!=-1)?LEVEL_3_NAME_MAP.get(countyInChoose.get(county)):""); return output; } String getOtherAddress(){ int max = Math.max(province,Math.max(county,city)); String maxName = ""; if(max==-1){ return sourceAddress; } if (province==max){ maxName=provinceInChoose.get(province); } if (city==max){ maxName=cityInChoose.get(city); } if (county==max){ maxName=countyInChoose.get(county); } String sub = sourceAddress.substring(max+maxName.length()); Matcher m = LEVEL_1_SUFFIX_PATTERN.matcher(sub); if (m.find()){ sub = sub.substring(m.end()); } m = LEVEL_2_SUFFIX_PATTERN.matcher(sub); if (m.find()){ sub = sub.substring(m.end()); } m = LEVEL_3_SUFFIX_PATTERN.matcher(sub); if (m.find()){ sub = sub.substring(m.end()); } return sub; } void findProvince(){ Map results = contain(this.sourceAddress,SIMPLE_NAME_TREE.keySet()); for (int index : results.keySet()){ String name = results.get(index); String sub = this.sourceAddress.substring(index+name.length()); //去除南京路,北京大道型选手 if (ROAD_SUFFIX_PATTERN.matcher(sub).find()){ continue; } provinceInChoose.put(index,name); //匹配到后缀时直接当做第一选择 if (LEVEL_1_SUFFIX_PATTERN.matcher(sub).find()){ province = index; } } //仅有一个选择时当成一选 if (provinceInChoose.size()==1){ province = (int)provinceInChoose.keySet().toArray()[0]; } } void findCity(){ Map results =null; //首先尝试在一选下匹配 if (province!=-1){ results = contain(this.sourceAddress,SIMPLE_NAME_TREE.get(provinceInChoose. get(province)).keySet()); } //一选不存在或匹配无结果,直接搜全国 if (results==null||results.isEmpty()){ results = contain(this.sourceAddress,LEVEL_2_NAME_MAP.keySet()); } Iterator iterator = results.keySet().iterator(); while (iterator.hasNext()) { int key = iterator.next(); String name = results.get(key); if (key > 0 && name.equals("南县") &&"滦辉甘桦灌苍阜屏定全沂莒汝衡南郁平宁思广洛商南".indexOf(sourceAddress.charAt(key - 1)) != -1) { iterator.remove(); } } for (int index : results.keySet()){ String name = results.get(index); String sub = this.sourceAddress.substring(index+name.length()); //去除南京路,北京大道型选手 if (ROAD_SUFFIX_PATTERN.matcher(sub).find()){ continue; } cityInChoose.put(index,name); //匹配到后缀时直接当做第一选择 if (LEVEL_2_SUFFIX_PATTERN.matcher(sub).find()){ city = index; } } //仅有一个选择时当成一选 if (cityInChoose.size()==1){ city = (int)cityInChoose.keySet().toArray()[0]; } } void findCounty(){ Map results = null; //尝试一选 if (city!=-1){ results=contain(sourceAddress,All_CITY_IN_TREE.get(cityInChoose.get(city))); } //一选不存在或匹配无结果,先搜全省 if ((results == null || results.isEmpty()) && province != -1) { results = contain(sourceAddress, SIMPLE_NAME_TREE.get(provinceInChoose.get(province)).values().stream(). flatMap(Set::stream).collect(Collectors.toSet())); } //最后全国 if (results == null || results.isEmpty()) { results = contain(sourceAddress, LEVEL_3_NAME_MAP.keySet()); } for (int index : results.keySet()) { String name = results.get(index); String sub = this.sourceAddress.substring(index + name.length()); //去除南京路,北京大道型选手 if (ROAD_SUFFIX_PATTERN.matcher(sub).find()) { continue; } countyInChoose.put(index, name); //匹配到后缀时直接当做第一选择 if (LEVEL_3_SUFFIX_PATTERN.matcher(sub).find()) { county = index; } } //仅有一个选择时当成一选 if (countyInChoose.size()==1){ county = (int)countyInChoose.keySet().toArray()[0]; } } } /** * 检查字符串含有哪些字符,输出这些匹配字符的位置和字符的map * @param s 被检查字符串 * @param nameList 检查范围 */ private static Map contain(String s,Iterable nameList){ Map output = new HashMap(); for (String name:nameList){ if (name.isEmpty())continue; int index = -1; while ((index = s.indexOf(name, index + 1)) != -1){ output.put(index,name); } } return output; } /** * 分离地址字符串,请优先使用shanghaiAddressSplitUtil,此类只分词到县
* 注意,当输入的地址错误时不会自动修正,未找到的级会被空置
* 例如输入"北京青浦区盈港路515号1061室" ,输出[北京市,北京市,青浦区,盈港路515号1061室]
* 输入"安徽怀宁县黄墩镇老埂村双闸组" ,输出[安徽省,怀宁县,黄墩镇,盈港路515号1061室]
* @return 结果为[省级,城级,县级,余下的部分],分离失败则返回null
* @see ShanghaiAddressSplitUtil */ public static String[] splitAddress(String address){ SplittingAddress a = new SplittingAddress(address.replaceAll("\\s+","")); a.findProvince(); a.findCity(); a.findCounty(); String[] output = a.toStringList(); output[3]=(a.getOtherAddress()); return output; } //测试用 public static void main(String[] args) { System.out.println(Arrays.toString(splitAddress("安徽省安徽省颍上县垂岗乡陶嘴村东道场31号"))); System.out.println(Arrays.toString(splitAddress("荣乐西路1058弄32号501室"))); System.out.println(Arrays.toString(splitAddress("泗泾镇新家园路30弄21号402室"))); System.out.println(Arrays.toString(splitAddress("山东省山东省单县莱河镇宋楼行政村霍井村041号"))); System.out.println(Arrays.toString(splitAddress("安徽省五河县安徽省五河县朱顶乡胡庄村447号"))); System.out.println(Arrays.toString(splitAddress("九亭镇九亭大街506弄22号101室"))); System.out.println(Arrays.toString(splitAddress("陕西省宝鸡市凤翔区陕西省凤翔区尹家务乡槐中村5组024号"))); System.out.println(Arrays.toString(splitAddress("江苏省海门市江苏省海门市正余镇王灶河村十三组36号"))); System.out.println(Arrays.toString(splitAddress("泗泾镇古楼公路519弄1号1102室"))); System.out.println(Arrays.toString(splitAddress("奉贤县奉城镇奉粮路115号"))); System.out.println(Arrays.toString(splitAddress("上海市奉贤区南桥镇沪杭支路24号14幢165室"))); System.out.println(Arrays.toString(splitAddress("浦东新区周浦镇年家浜路10、12号1层"))); } }