AddressSplitUtil.java 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292
  1. package com.skyversation.poiaddr.util;
  2. import java.io.InputStream;
  3. import java.util.*;
  4. import java.util.regex.Matcher;
  5. import java.util.regex.Pattern;
  6. import java.util.stream.Collectors;
  7. public class AddressSplitUtil {
  8. /**省级行政单位简称->全称映射表
  9. * 只读,请勿修改
  10. * */
  11. public static final Map<String,String> LEVEL_1_NAME_MAP;
  12. /**市级行政单位简称->全称映射表
  13. * 只读,请勿修改
  14. * */
  15. public static final Map<String,String> LEVEL_2_NAME_MAP;
  16. /**县级行政单位简称->全称映射表
  17. * 只读,请勿修改
  18. * */
  19. public static final Map<String,String> LEVEL_3_NAME_MAP;
  20. /**省市县三级行政单位简称树
  21. * 只读,请勿修改
  22. * */
  23. public static final Map<String,Map<String,Set<String>>> SIMPLE_NAME_TREE;
  24. private static final Map<String,Set<String>> All_CITY_IN_TREE;
  25. private static final Pattern LEVEL_1_SUFFIX_PATTERN = Pattern.compile("^(?:维吾尔|((?:(?!省|市|自治区).)*?族))?(?:省|市|自治区)");
  26. private static final Pattern LEVEL_2_SUFFIX_PATTERN = Pattern.compile("^(?:市|自治州|地区|盟)");
  27. private static final Pattern LEVEL_3_SUFFIX_PATTERN = Pattern.compile("^(?:县|自治县|市|区|旗|自治旗|林区|特区)");
  28. public static final Pattern ROAD_SUFFIX_PATTERN = Pattern.compile("^(?:旅游区|[东南西北中一二三四五六七八九十公大小支新老]{0,2}(?:大街|路|大道|街|菜市场|马路|村))");
  29. static {
  30. Map<String,String> level1NameMap = new HashMap<>();
  31. Map<String,String> level2NameMap = new HashMap<>();
  32. Map<String,String> level3NameMap = new HashMap<>();
  33. Map<String,Map<String,Set<String>>> simpleNameTree = new HashMap<>();
  34. String file = "全国省市县记录.xlsx";
  35. InputStream is = AddressSplitUtil.class.getResourceAsStream(file);
  36. if (is==null) is= AddressSplitUtil.class.getResourceAsStream("/"+file);
  37. if (is==null) throw new RuntimeException("无法找到"+file);
  38. try {
  39. List<Map<String, Object>> list = ExcelReaderUtils.readExcel(is);
  40. for (Map<String,Object> row : list) {
  41. Object level1Name = row.get("省份");
  42. Object level1SimpleName = row.get("省份简称");
  43. Object level2Name = row.get("地级市");
  44. Object level2SimpleName = row.get("地级市简称");
  45. Object level3Name = row.get("县级市");
  46. Object level3SimpleName = row.get("县级市简称");
  47. if (level1SimpleName != null && level1Name!=null) {
  48. level1NameMap.put(level1SimpleName.toString(), level1Name.toString());
  49. }
  50. if (level2SimpleName != null && level2Name!=null) {
  51. level2NameMap.put(level2SimpleName.toString(), level2Name.toString());
  52. }
  53. if (level3SimpleName != null && level3Name!=null) {
  54. level3NameMap.put(level3SimpleName.toString(), level3Name.toString());
  55. }
  56. if (level1SimpleName!=null){
  57. if (!simpleNameTree.containsKey(level1SimpleName.toString())){
  58. simpleNameTree.put(level1SimpleName.toString(),new HashMap<>());
  59. }
  60. Map<String, Set<String>> level2Map = simpleNameTree.get(level1SimpleName.toString());
  61. if (level2SimpleName!=null){
  62. if(!level2Map.containsKey(level2SimpleName.toString())){
  63. level2Map.put(level2SimpleName.toString(),new HashSet<>());
  64. }
  65. if (level3SimpleName!=null){
  66. level2Map.get(level2SimpleName.toString()).add(level3SimpleName.toString());
  67. }
  68. }
  69. }
  70. }
  71. LEVEL_1_NAME_MAP = Collections.unmodifiableMap(level1NameMap);
  72. LEVEL_2_NAME_MAP = Collections.unmodifiableMap(level2NameMap);
  73. LEVEL_3_NAME_MAP = Collections.unmodifiableMap(level3NameMap);
  74. Map<String,Map<String,Set<String>>> simpleNameTree_= new HashMap<>();
  75. for (String key : simpleNameTree.keySet()){
  76. simpleNameTree_.put(key , Collections.unmodifiableMap(simpleNameTree.get(key)));
  77. }
  78. SIMPLE_NAME_TREE= Collections.unmodifiableMap(simpleNameTree_);
  79. All_CITY_IN_TREE= Collections.unmodifiableMap(SIMPLE_NAME_TREE.values().stream()
  80. .flatMap(map -> map.entrySet().stream())
  81. .collect(Collectors.toMap(
  82. Map.Entry::getKey,
  83. Map.Entry::getValue,
  84. (oldValue, newValue) -> newValue
  85. ))
  86. );
  87. } catch (Exception e) {
  88. throw new RuntimeException(e);
  89. }
  90. }
  91. private static class SplittingAddress {
  92. int province = -1;
  93. int city = -1;
  94. int county = -1;
  95. Map<Integer,String> provinceInChoose = new HashMap<>();
  96. Map<Integer,String> cityInChoose = new HashMap<>();
  97. Map<Integer,String> countyInChoose = new HashMap<>();
  98. String sourceAddress ;
  99. SplittingAddress(String sourceAddress){
  100. this.sourceAddress=sourceAddress;
  101. }
  102. String[] toStringList(){
  103. String[] output = new String[4];
  104. output[0]=((province!=-1)?LEVEL_1_NAME_MAP.get(provinceInChoose.get(province)):"");
  105. output[1]=((city!=-1)?LEVEL_2_NAME_MAP.get(cityInChoose.get(city)):"");
  106. output[2]=((county!=-1)?LEVEL_3_NAME_MAP.get(countyInChoose.get(county)):"");
  107. return output;
  108. }
  109. String getOtherAddress(){
  110. int max = Math.max(province,Math.max(county,city));
  111. String maxName = "";
  112. if(max==-1){
  113. return sourceAddress;
  114. }
  115. if (province==max){
  116. maxName=provinceInChoose.get(province);
  117. }
  118. if (city==max){
  119. maxName=cityInChoose.get(city);
  120. }
  121. if (county==max){
  122. maxName=countyInChoose.get(county);
  123. }
  124. String sub = sourceAddress.substring(max+maxName.length());
  125. Matcher m = LEVEL_1_SUFFIX_PATTERN.matcher(sub);
  126. if (m.find()){
  127. sub = sub.substring(m.end());
  128. }
  129. m = LEVEL_2_SUFFIX_PATTERN.matcher(sub);
  130. if (m.find()){
  131. sub = sub.substring(m.end());
  132. }
  133. m = LEVEL_3_SUFFIX_PATTERN.matcher(sub);
  134. if (m.find()){
  135. sub = sub.substring(m.end());
  136. }
  137. return sub;
  138. }
  139. void findProvince(){
  140. Map<Integer,String> results = contain(this.sourceAddress,SIMPLE_NAME_TREE.keySet());
  141. for (int index : results.keySet()){
  142. String name = results.get(index);
  143. String sub = this.sourceAddress.substring(index+name.length());
  144. //去除南京路,北京大道型选手
  145. if (ROAD_SUFFIX_PATTERN.matcher(sub).find()){
  146. continue;
  147. }
  148. provinceInChoose.put(index,name);
  149. //匹配到后缀时直接当做第一选择
  150. if (LEVEL_1_SUFFIX_PATTERN.matcher(sub).find()){
  151. province = index;
  152. }
  153. }
  154. //仅有一个选择时当成一选
  155. if (provinceInChoose.size()==1){
  156. province = (int)provinceInChoose.keySet().toArray()[0];
  157. }
  158. }
  159. void findCity(){
  160. Map<Integer,String> results =null;
  161. //首先尝试在一选下匹配
  162. if (province!=-1){
  163. results = contain(this.sourceAddress,SIMPLE_NAME_TREE.get(provinceInChoose.
  164. get(province)).keySet());
  165. }
  166. //一选不存在或匹配无结果,直接搜全国
  167. if (results==null||results.isEmpty()){
  168. results = contain(this.sourceAddress,LEVEL_2_NAME_MAP.keySet());
  169. }
  170. Iterator<Integer> iterator = results.keySet().iterator();
  171. while (iterator.hasNext()) {
  172. int key = iterator.next();
  173. String name = results.get(key);
  174. if (key > 0 && name.equals("南县") &&"滦辉甘桦灌苍阜屏定全沂莒汝衡南郁平宁思广洛商南".indexOf(sourceAddress.charAt(key - 1)) != -1) {
  175. iterator.remove();
  176. }
  177. }
  178. for (int index : results.keySet()){
  179. String name = results.get(index);
  180. String sub = this.sourceAddress.substring(index+name.length());
  181. //去除南京路,北京大道型选手
  182. if (ROAD_SUFFIX_PATTERN.matcher(sub).find()){
  183. continue;
  184. }
  185. cityInChoose.put(index,name);
  186. //匹配到后缀时直接当做第一选择
  187. if (LEVEL_2_SUFFIX_PATTERN.matcher(sub).find()){
  188. city = index;
  189. }
  190. }
  191. //仅有一个选择时当成一选
  192. if (cityInChoose.size()==1){
  193. city = (int)cityInChoose.keySet().toArray()[0];
  194. }
  195. }
  196. void findCounty(){
  197. Map<Integer,String> results = null;
  198. //尝试一选
  199. if (city!=-1){
  200. results=contain(sourceAddress,All_CITY_IN_TREE.get(cityInChoose.get(city)));
  201. }
  202. //一选不存在或匹配无结果,先搜全省
  203. if ((results == null || results.isEmpty()) && province != -1) {
  204. results = contain(sourceAddress, SIMPLE_NAME_TREE.get(provinceInChoose.get(province)).values().stream().
  205. flatMap(Set::stream).collect(Collectors.toSet()));
  206. }
  207. //最后全国
  208. if (results == null || results.isEmpty()) {
  209. results = contain(sourceAddress, LEVEL_3_NAME_MAP.keySet());
  210. }
  211. for (int index : results.keySet()) {
  212. String name = results.get(index);
  213. String sub = this.sourceAddress.substring(index + name.length());
  214. //去除南京路,北京大道型选手
  215. if (ROAD_SUFFIX_PATTERN.matcher(sub).find()) {
  216. continue;
  217. }
  218. countyInChoose.put(index, name);
  219. //匹配到后缀时直接当做第一选择
  220. if (LEVEL_3_SUFFIX_PATTERN.matcher(sub).find()) {
  221. county = index;
  222. }
  223. }
  224. //仅有一个选择时当成一选
  225. if (countyInChoose.size()==1){
  226. county = (int)countyInChoose.keySet().toArray()[0];
  227. }
  228. }
  229. }
  230. /**
  231. * 检查字符串含有哪些字符,输出这些匹配字符的位置和字符的map
  232. * @param s 被检查字符串
  233. * @param nameList 检查范围
  234. */
  235. private static Map<Integer,String> contain(String s,Iterable<String> nameList){
  236. Map<Integer,String> output = new HashMap<Integer,String>();
  237. for (String name:nameList){
  238. if (name.isEmpty())continue;
  239. int index = -1;
  240. while ((index = s.indexOf(name, index + 1)) != -1){
  241. output.put(index,name);
  242. }
  243. }
  244. return output;
  245. }
  246. /**
  247. * 分离地址字符串,请优先使用shanghaiAddressSplitUtil,此类只分词到县<br/>
  248. * 注意,当输入的地址错误时不会自动修正,未找到的级会被空置<br/>
  249. * 例如输入"北京青浦区盈港路515号1061室" ,输出[北京市,北京市,青浦区,盈港路515号1061室]<br/>
  250. * 输入"安徽怀宁县黄墩镇老埂村双闸组" ,输出[安徽省,怀宁县,黄墩镇,盈港路515号1061室]<br/>
  251. * @return 结果为[省级,城级,县级,余下的部分],分离失败则返回null<br/>
  252. * @see ShanghaiAddressSplitUtil
  253. */
  254. public static String[] splitAddress(String address){
  255. SplittingAddress a = new SplittingAddress(address.replaceAll("\\s+",""));
  256. a.findProvince();
  257. a.findCity();
  258. a.findCounty();
  259. String[] output = a.toStringList();
  260. output[3]=(a.getOtherAddress());
  261. return output;
  262. }
  263. //测试用
  264. public static void main(String[] args) {
  265. System.out.println(Arrays.toString(splitAddress("安徽省安徽省颍上县垂岗乡陶嘴村东道场31号")));
  266. System.out.println(Arrays.toString(splitAddress("荣乐西路1058弄32号501室")));
  267. System.out.println(Arrays.toString(splitAddress("泗泾镇新家园路30弄21号402室")));
  268. System.out.println(Arrays.toString(splitAddress("山东省山东省单县莱河镇宋楼行政村霍井村041号")));
  269. System.out.println(Arrays.toString(splitAddress("安徽省五河县安徽省五河县朱顶乡胡庄村447号")));
  270. System.out.println(Arrays.toString(splitAddress("九亭镇九亭大街506弄22号101室")));
  271. System.out.println(Arrays.toString(splitAddress("陕西省宝鸡市凤翔区陕西省凤翔区尹家务乡槐中村5组024号")));
  272. System.out.println(Arrays.toString(splitAddress("江苏省海门市江苏省海门市正余镇王灶河村十三组36号")));
  273. System.out.println(Arrays.toString(splitAddress("泗泾镇古楼公路519弄1号1102室")));
  274. System.out.println(Arrays.toString(splitAddress("奉贤县奉城镇奉粮路115号")));
  275. System.out.println(Arrays.toString(splitAddress("上海市奉贤区南桥镇沪杭支路24号14幢165室")));
  276. System.out.println(Arrays.toString(splitAddress("浦东新区周浦镇年家浜路10、12号1层")));
  277. }
  278. }