|
@@ -0,0 +1,466 @@
|
|
|
+package com.skyversation.poiaddr.util;
|
|
|
+
|
|
|
+import com.skyversation.poiaddr.bean.SplitAddress;
|
|
|
+import lombok.AllArgsConstructor;
|
|
|
+import org.springframework.stereotype.Service;
|
|
|
+
|
|
|
+import javax.annotation.PostConstruct;
|
|
|
+import java.io.InputStream;
|
|
|
+import java.util.*;
|
|
|
+import java.util.regex.Matcher;
|
|
|
+import java.util.regex.Pattern;
|
|
|
+import java.util.stream.Collectors;
|
|
|
+
|
|
|
+@Service
|
|
|
+public class ShanghaiAddressSplitUtil {
|
|
|
+ @AllArgsConstructor
|
|
|
+ static class threeLevelAddress{
|
|
|
+ String district;
|
|
|
+ String street;
|
|
|
+ String community;
|
|
|
+ String districtFullName;
|
|
|
+ String streetFullName;
|
|
|
+ String communityFullName;
|
|
|
+ String districtCode;
|
|
|
+ String streetCode;
|
|
|
+ String communityCode;
|
|
|
+
|
|
|
+ }
|
|
|
+ private static Map<String,List<threeLevelAddress>> All_STREET_IN_SHANGHAI;
|
|
|
+ private static Map<String,List<threeLevelAddress>> All_COMMUNITY_IN_SHANGHAI;
|
|
|
+ private static Map<String,List<String>> DISTRICT_TO_STREET_MAP;
|
|
|
+ private static Map<String,List<String>> STREET_TO_COMMUNITY_MAP;
|
|
|
+ private static Map<String,List<String>> DISTRICT_TO_COMMUNITY_MAP;
|
|
|
+
|
|
|
+ private static final Pattern LEVEL_1_SUFFIX_PATTERN = Pattern.compile("^(?:区|新区)");
|
|
|
+
|
|
|
+ private static final Pattern LEVEL_2_SUFFIX_PATTERN = Pattern.compile("^(?:街道|路街道|镇|乡|新镇)");
|
|
|
+
|
|
|
+ private static final Pattern LEVEL_3_SUFFIX_PATTERN = Pattern.compile("^(?:居委会|管委会居委会|管委会|社区|社区居委会|居民委员会|居民区|居委|村|村委会|园区|苑|安居办|居|工作站|会)");
|
|
|
+
|
|
|
+ private static final Pattern ROAD_SUFFIX_PATTERN = AddressSplitUtil.ROAD_SUFFIX_PATTERN;
|
|
|
+
|
|
|
+ private static final Pattern UN_ADDRESS_PATTERN = Pattern.compile("http");
|
|
|
+
|
|
|
+ private static final Pattern OVER_SPLIT=Pattern.compile("^(?:[0123456789-\\-一二三四五六七八九十大A-za-z]{0,4}[街队组栋号站弄]|(?:车站|工业区|市场|农贸市场)(?![东南西北中一二三四五六七八九十公大小支新老环]路)|[A-za-z]?[0123456789-\\-])");
|
|
|
+
|
|
|
+ private static final Pattern MULTI_ADDRESS = Pattern.compile("(?<=[0-9])[号弄]?[、—/\\\\-][0-9]+(?=[号弄])");
|
|
|
+ @PostConstruct
|
|
|
+ private void init(){
|
|
|
+ System.out.println("开始初始化分词器");
|
|
|
+ Map<String,threeLevelAddress> districtMap= new HashMap<>();
|
|
|
+ Map<String,List<threeLevelAddress>> streetMap= new HashMap<>();
|
|
|
+ Map<String,List<threeLevelAddress>> communityMap= new HashMap<>();
|
|
|
+ Map<String,List<String>> districtToStreetMap=new HashMap<>();
|
|
|
+ Map<String,List<String>> streetToCommunityMap=new HashMap<>();
|
|
|
+
|
|
|
+ String file = "上海市县乡记录.xlsx";
|
|
|
+ InputStream is = ShanghaiAddressSplitUtil.class.getResourceAsStream(file);
|
|
|
+ if (is==null) is= ShanghaiAddressSplitUtil.class.getResourceAsStream("/"+file);
|
|
|
+ if (is==null) throw new RuntimeException("无法找到"+file);
|
|
|
+ try {
|
|
|
+ for (Map<String, Object> row : ExcelReaderUtils.readExcel(is)) {
|
|
|
+ String district = Optional.ofNullable(row.get("县级市简称")).map(Object::toString).orElse("");
|
|
|
+ String street = Optional.ofNullable(row.get("街道简称")).map(Object::toString).orElse("");
|
|
|
+ String community = Optional.ofNullable(row.get("居委")).map(Object::toString).orElse("");
|
|
|
+ String districtFullName = Optional.ofNullable(row.get("县级市")).map(Object::toString).orElse("");
|
|
|
+ String streetFullName = Optional.ofNullable(row.get("街道")).map(Object::toString).orElse("");
|
|
|
+ String communityFullName = Optional.ofNullable(row.get("居委")).map(Object::toString).orElse("");
|
|
|
+ String districtCode = Optional.ofNullable(row.get("县级市编码")).map(Object::toString).orElse("");
|
|
|
+ String streetCode = Optional.ofNullable(row.get("街道编码")).map(Object::toString).orElse("");
|
|
|
+ String communityCode = Optional.ofNullable(row.get("居委编码")).map(Object::toString).orElse("");
|
|
|
+ initData(district, street, community, districtFullName, streetFullName, communityFullName,districtCode, streetCode, communityCode, districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
|
|
|
+ }
|
|
|
+ //自贸区
|
|
|
+ initData("浦东", "试验区","", "浦东新区", "自由贸易试验区","","310115","","", districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
|
|
|
+ //松江镇特别处理
|
|
|
+ initData("松江", "松江","", "松江区", "","","310117","","", districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
|
|
|
+ //金山工业区
|
|
|
+ initData("金山", "金山工业区","", "金山区", "金山工业区","","310116","","", districtMap, streetMap, communityMap, districtToStreetMap, streetToCommunityMap);
|
|
|
+
|
|
|
+ } catch (Exception e) {
|
|
|
+ throw new RuntimeException(e);
|
|
|
+ }
|
|
|
+ All_STREET_IN_SHANGHAI = Collections.unmodifiableMap(streetMap);
|
|
|
+ All_COMMUNITY_IN_SHANGHAI = Collections.unmodifiableMap(communityMap);
|
|
|
+ DISTRICT_TO_STREET_MAP=Collections.unmodifiableMap(districtToStreetMap);
|
|
|
+ STREET_TO_COMMUNITY_MAP=Collections.unmodifiableMap(streetToCommunityMap);
|
|
|
+ DISTRICT_TO_COMMUNITY_MAP=Collections.unmodifiableMap(DISTRICT_TO_STREET_MAP.entrySet().stream()
|
|
|
+ .collect(Collectors.toMap(
|
|
|
+ Map.Entry::getKey,
|
|
|
+ entry -> entry.getValue().stream()
|
|
|
+ .flatMap(street -> STREET_TO_COMMUNITY_MAP.getOrDefault(street, Collections.emptyList()).stream())
|
|
|
+ .collect(Collectors.toList())
|
|
|
+ )));
|
|
|
+ System.out.println("分词器初始化完成");
|
|
|
+ }
|
|
|
+
|
|
|
+ private static void initData(String district, String street, String community, String districtFullName, String streetFullName, String communityFullName,String districtCode, String streetCode, String communityCode, Map<String, threeLevelAddress> districtMap, Map<String, List<threeLevelAddress>> streetMap, Map<String, List<threeLevelAddress>> communityMap, Map<String, List<String>> districtToStreetMap, Map<String, List<String>> streetToCommunityMap) {
|
|
|
+ threeLevelAddress add = new threeLevelAddress(district, street, community, districtFullName, streetFullName, communityFullName, districtCode ,streetCode, communityCode);
|
|
|
+ districtMap.put(district,add);
|
|
|
+ if (!streetMap.containsKey(street)) streetMap.put(street,new ArrayList<>());
|
|
|
+ streetMap.get(street).add(add);
|
|
|
+ if (!communityMap.containsKey(community)) communityMap.put(community,new ArrayList<>());
|
|
|
+ communityMap.get(community).add(add);
|
|
|
+ if (!districtToStreetMap.containsKey(district)) districtToStreetMap.put(district,new ArrayList<>());
|
|
|
+ districtToStreetMap.get(district).add(street);
|
|
|
+ if (!streetToCommunityMap.containsKey(street)) streetToCommunityMap.put(street,new ArrayList<>());
|
|
|
+ streetToCommunityMap.get(street).add(community);
|
|
|
+ }
|
|
|
+
|
|
|
+ private static class splittingAddress{
|
|
|
+ SplitAddress splitAddress;
|
|
|
+
|
|
|
+ int street=-1;
|
|
|
+ int community=-1;
|
|
|
+
|
|
|
+ Map<Integer,String> streetMap =new HashMap<>();
|
|
|
+ Map<Integer,String> communityMap=new HashMap<>();
|
|
|
+ threeLevelAddress threeLevelAddress;
|
|
|
+
|
|
|
+ String targetString;
|
|
|
+
|
|
|
+ void findStreet(){
|
|
|
+ Map<Integer,String> results =null;
|
|
|
+ int completeMatchIndex=-1;
|
|
|
+ //首先尝试在一选下匹配
|
|
|
+ if (splitAddress.getDistrict()!=null){
|
|
|
+ results = contain(this.targetString,DISTRICT_TO_STREET_MAP.get(splitAddress.getDistrict()),0);
|
|
|
+ completeMatchIndex = washResult(this.targetString,results,LEVEL_2_SUFFIX_PATTERN,LEVEL_3_SUFFIX_PATTERN,LEVEL_1_SUFFIX_PATTERN);
|
|
|
+ }
|
|
|
+ //一选不存在或匹配无结果,直接搜全国
|
|
|
+ if (results==null||results.isEmpty()){
|
|
|
+ results = contain(this.targetString,All_STREET_IN_SHANGHAI.keySet(),0);
|
|
|
+ if (completeMatchIndex==-1)completeMatchIndex = washResult(this.targetString,results,LEVEL_2_SUFFIX_PATTERN,LEVEL_3_SUFFIX_PATTERN,LEVEL_1_SUFFIX_PATTERN);
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ streetMap.putAll(results);
|
|
|
+ street=completeMatchIndex;
|
|
|
+ //仅有一个选择时当成一选
|
|
|
+ if (streetMap.size()==1){
|
|
|
+ street = (int)streetMap.keySet().toArray()[0];
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ void findCommunity(){
|
|
|
+ Map<Integer,String> results = null;
|
|
|
+ int completeMatchCommunity=-1;
|
|
|
+ String sub=targetString;
|
|
|
+ //尝试一选
|
|
|
+ if (street!=-1){
|
|
|
+ sub = targetString.substring(street+streetMap.get(street).length());
|
|
|
+ Matcher m = LEVEL_2_SUFFIX_PATTERN.matcher(sub);
|
|
|
+ if (m.find()){
|
|
|
+ sub=sub.substring(m.end());
|
|
|
+ }
|
|
|
+ results= contain(sub,STREET_TO_COMMUNITY_MAP.get(streetMap.get(street)),targetString.length()-sub.length());
|
|
|
+ completeMatchCommunity=washResult(targetString,results,LEVEL_3_SUFFIX_PATTERN,LEVEL_1_SUFFIX_PATTERN,LEVEL_2_SUFFIX_PATTERN);
|
|
|
+
|
|
|
+ }
|
|
|
+ //一选不存在或匹配无结果,先搜全区
|
|
|
+ if ((results == null || results.isEmpty()) && splitAddress.getDistrict()!=null) {
|
|
|
+ results = contain(sub, DISTRICT_TO_COMMUNITY_MAP.get(splitAddress.getDistrict()),targetString.length()-sub.length());
|
|
|
+ if (completeMatchCommunity==-1)completeMatchCommunity=washResult(targetString,results,LEVEL_3_SUFFIX_PATTERN,LEVEL_1_SUFFIX_PATTERN,LEVEL_2_SUFFIX_PATTERN);
|
|
|
+ }
|
|
|
+ //最后全市
|
|
|
+ if (results == null || results.isEmpty()) {
|
|
|
+ results = contain(sub, All_COMMUNITY_IN_SHANGHAI.keySet(),targetString.length()-sub.length());
|
|
|
+ if (completeMatchCommunity==-1)completeMatchCommunity=washResult(targetString,results,LEVEL_3_SUFFIX_PATTERN,LEVEL_1_SUFFIX_PATTERN,LEVEL_2_SUFFIX_PATTERN);
|
|
|
+ }
|
|
|
+ Iterator<Integer> iterator = results.keySet().iterator();
|
|
|
+ while (iterator.hasNext()) {
|
|
|
+ int key = iterator.next();
|
|
|
+ String name = results.get(key);
|
|
|
+ if (key > 0 && name.equals("江镇") && targetString.charAt(key - 1) == '松') {
|
|
|
+ iterator.remove();
|
|
|
+ }
|
|
|
+ }
|
|
|
+ communityMap.putAll(results);
|
|
|
+ //仅有一个选择时当成一选
|
|
|
+ if (communityMap.size()==1){
|
|
|
+ int index = (int)communityMap.keySet().toArray()[0];
|
|
|
+ if (street!=index)community=index;
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ void matchThreeLevelAdd(){
|
|
|
+ int handingPoint=0;
|
|
|
+ threeLevelAddress handingTLA=new threeLevelAddress("","","","","","","","","");
|
|
|
+ for (String communityName: new HashSet<>(communityMap.values())){
|
|
|
+ if (communityName.isEmpty())continue;
|
|
|
+ for(threeLevelAddress t:All_COMMUNITY_IN_SHANGHAI.get(communityName)){
|
|
|
+ int point = checkTLA(t);
|
|
|
+ if (point==221){
|
|
|
+ threeLevelAddress=t;
|
|
|
+ return;
|
|
|
+ }else if (point>handingPoint){
|
|
|
+ handingPoint=point;
|
|
|
+ handingTLA=t;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for (String streetName:new HashSet<>(streetMap.values())){
|
|
|
+ if (streetName.isEmpty())continue;
|
|
|
+ for(threeLevelAddress t:All_STREET_IN_SHANGHAI.get(streetName)){
|
|
|
+ int point = checkTLA(t);
|
|
|
+ if (point==2111){
|
|
|
+ threeLevelAddress=t;
|
|
|
+ return;
|
|
|
+ }else if (point>handingPoint){
|
|
|
+ handingPoint=point;
|
|
|
+ handingTLA=t;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ threeLevelAddress = handingTLA;
|
|
|
+ }
|
|
|
+ int checkTLA(threeLevelAddress t){
|
|
|
+ int output=0;
|
|
|
+ if (t.district.equals(splitAddress.getDistrict()))output+=1;
|
|
|
+ if (streetMap.containsValue(t.street))output+=10;
|
|
|
+ if (street!=-1&&streetMap.get(street).equals(t.street))output+=1000;
|
|
|
+ if (communityMap.containsValue(t.community))output+=100;
|
|
|
+ if (community!=-1&&communityMap.get(community).equals(t.community))output+=1000;
|
|
|
+ if (community!=-1&&Pattern.matches(".*\\d$",communityMap.get(community)))output-=1000;
|
|
|
+ return output;
|
|
|
+ }
|
|
|
+ void guessFirstMatch(){
|
|
|
+ //先街道
|
|
|
+ if (!streetMap.isEmpty()&&street==-1) {
|
|
|
+ for (int i :streetMap.keySet()){
|
|
|
+ if (streetMap.get(i).equals(threeLevelAddress.street)&&(i<street||street==-1)) {
|
|
|
+ street=i;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //再居委
|
|
|
+ if (community==-1&& !communityMap.isEmpty()){
|
|
|
+ for (int i :communityMap.keySet()){
|
|
|
+ if (communityMap.get(i).equals(threeLevelAddress.community)&&street!=i&&(i<community||community==-1)){
|
|
|
+ community=i;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+
|
|
|
+ }
|
|
|
+ }
|
|
|
+ static int washResult(String sourceAddress, Map<Integer, String> result, Pattern should, Pattern... never){
|
|
|
+ Map<Integer,String> output=new HashMap<>();
|
|
|
+ int outputInt = -1;
|
|
|
+ for (int index : result.keySet()) {
|
|
|
+ String name = result.get(index);
|
|
|
+ String sub =sourceAddress.substring(index + name.length());
|
|
|
+ //匹配到后缀时直接保留
|
|
|
+ if (should.matcher(sub).find()) {
|
|
|
+ outputInt=index;
|
|
|
+ } else {
|
|
|
+ //去除南京路,北京大道型选手
|
|
|
+ if (ROAD_SUFFIX_PATTERN.matcher(sub).find()) {
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+ boolean skip =false;
|
|
|
+ for (Pattern p :never){
|
|
|
+ if (p.matcher(sub).find())skip=true;
|
|
|
+ }
|
|
|
+ if (skip) continue;
|
|
|
+ }
|
|
|
+ output.put(index,name);
|
|
|
+ }
|
|
|
+ result.clear();
|
|
|
+ result.putAll(output);
|
|
|
+ return outputInt;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 检查字符串含有哪些字符,输出这些匹配字符的位置和字符的map
|
|
|
+ * @param s 被检查字符串
|
|
|
+ * @param nameList 检查范围
|
|
|
+ */
|
|
|
+ private static Map<Integer,String> contain(String s,Iterable<String> nameList,int offset){
|
|
|
+ Map<Integer,String> output = new HashMap<>();
|
|
|
+ if (nameList==null){
|
|
|
+ return output;
|
|
|
+ }
|
|
|
+ for (String name:nameList){
|
|
|
+ if (name.isEmpty())continue;
|
|
|
+ int index = -1;
|
|
|
+ while ((index = s.indexOf(name, index + 1)) != -1){
|
|
|
+ output.put(index+offset,name);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return output;
|
|
|
+ }
|
|
|
+ private static SplitAddress split(String sourceAddress){
|
|
|
+ //事前准备
|
|
|
+ String beautyAddress = sourceAddress.replaceAll("[\\s]+","");
|
|
|
+
|
|
|
+ SplitAddress splitAddress = new SplitAddress();
|
|
|
+ splitAddress.setFullAddress(sourceAddress);
|
|
|
+
|
|
|
+
|
|
|
+ splittingAddress splittingAddress = new splittingAddress();
|
|
|
+ splittingAddress.splitAddress=splitAddress;
|
|
|
+
|
|
|
+
|
|
|
+ String[] result = AddressSplitUtil.splitAddress(beautyAddress);
|
|
|
+
|
|
|
+
|
|
|
+ splitAddress.setProvince(result[0]);
|
|
|
+ splitAddress.setCity(result[1]);
|
|
|
+ splitAddress.setDistrict(result[2]);
|
|
|
+ //检查是否在外省,未找到省市或者在省市中找到上海,或者找到上海的区都算作省内
|
|
|
+ Map<Integer, String> districtContainResult = contain(beautyAddress, DISTRICT_TO_COMMUNITY_MAP.keySet(), 0);
|
|
|
+ washResult(beautyAddress, districtContainResult,LEVEL_1_SUFFIX_PATTERN);
|
|
|
+ if (!((result[0].isEmpty()|| result[0].equals("上海市")) && (result[1].isEmpty() || result[1].equals("上海市")||
|
|
|
+ !districtContainResult.isEmpty()))) {
|
|
|
+ splitAddress.setStatus(2);
|
|
|
+ splitAddress.setAddr(result[3]);
|
|
|
+ return splitAddress;
|
|
|
+ }
|
|
|
+ splitAddress.setProvince("上海市");
|
|
|
+ splitAddress.setCity("上海市");
|
|
|
+ splitAddress.setCityCode("3101");
|
|
|
+ splittingAddress.targetString = beautyAddress;
|
|
|
+ //开始省内分词
|
|
|
+ splittingAddress.findStreet();
|
|
|
+ splittingAddress.findCommunity();
|
|
|
+ splittingAddress.matchThreeLevelAdd();
|
|
|
+ splittingAddress.guessFirstMatch();
|
|
|
+
|
|
|
+
|
|
|
+ if (splittingAddress.street!=-1||splittingAddress.community!=-1){
|
|
|
+ splitAddress.setStreet(splittingAddress.threeLevelAddress.streetFullName);
|
|
|
+ splitAddress.setStreetCode(splittingAddress.threeLevelAddress.streetCode);
|
|
|
+
|
|
|
+ }
|
|
|
+ if (splittingAddress.community!=-1){
|
|
|
+ splitAddress.setCommunity(splittingAddress.threeLevelAddress.communityFullName);
|
|
|
+ splitAddress.setCommunityCode(splittingAddress.threeLevelAddress.communityCode);
|
|
|
+
|
|
|
+ }
|
|
|
+ splitAddress.setDistrict(splittingAddress.threeLevelAddress.districtFullName);
|
|
|
+ splitAddress.setDistrictCode(splittingAddress.threeLevelAddress.districtCode);
|
|
|
+
|
|
|
+
|
|
|
+ //检查是否能够分离
|
|
|
+ if(splittingAddress.community==-1&&splittingAddress.street==-1){
|
|
|
+ //检查是否是非地址
|
|
|
+ if (UN_ADDRESS_PATTERN.matcher(splitAddress.getFullAddress()).find()){
|
|
|
+ splitAddress.setStatus(3);
|
|
|
+ return splitAddress;
|
|
|
+ }
|
|
|
+ splitAddress.setDistrict(result[2]);
|
|
|
+ splitAddress.setAddr(result[3]);
|
|
|
+
|
|
|
+ splitAddress.setStatus(1);
|
|
|
+ if (result[0].isEmpty()&&result[1].isEmpty()&&districtContainResult.isEmpty())splitAddress.setStatus(4);
|
|
|
+ return splitAddress;
|
|
|
+ }
|
|
|
+ //尝试分离
|
|
|
+ if (splittingAddress.street> splittingAddress.community){
|
|
|
+ String sub = beautyAddress.substring(splittingAddress.street+splittingAddress.streetMap.get(splittingAddress.street).length());
|
|
|
+ Matcher m = LEVEL_2_SUFFIX_PATTERN.matcher(sub);
|
|
|
+ if (m.find()){
|
|
|
+ sub = sub.substring(m.end());
|
|
|
+ }
|
|
|
+ splitAddress.setAddr(sub);
|
|
|
+ }else {
|
|
|
+ String sub = beautyAddress.substring(
|
|
|
+ splittingAddress.community+
|
|
|
+ splittingAddress.communityMap.
|
|
|
+ get(splittingAddress.community).length());
|
|
|
+ Matcher m = LEVEL_3_SUFFIX_PATTERN.matcher(sub);
|
|
|
+ if (m.find()){
|
|
|
+ sub = sub.substring(m.end());
|
|
|
+ }
|
|
|
+ splitAddress.setAddr(sub);
|
|
|
+ }
|
|
|
+ splitAddress.setStatus(0);
|
|
|
+ if (result[0].isEmpty()&&result[1].isEmpty()&&districtContainResult.isEmpty())splitAddress.setStatus(4);
|
|
|
+ if (splitAddress.getStreet().equals("自由贸易试验区"))splitAddress.setStatus(0);
|
|
|
+ return splitAddress;
|
|
|
+ }
|
|
|
+
|
|
|
+ private static SplitAddress beautyResult(SplitAddress splitAddress){
|
|
|
+ //检查过度分割
|
|
|
+ if (splitAddress.getAddr().isEmpty() ||OVER_SPLIT.matcher(splitAddress.getAddr()).find()){
|
|
|
+ if (splitAddress.getCommunity().isEmpty()){
|
|
|
+ if (splitAddress.getStreet().isEmpty()){
|
|
|
+ if (splitAddress.getDistrict().isEmpty()){
|
|
|
+ splitAddress.setAddr("上海市"+splitAddress.getAddr());
|
|
|
+ }else {
|
|
|
+ splitAddress.setAddr(splitAddress.getDistrict()+splitAddress.getAddr());
|
|
|
+ }
|
|
|
+ }else {
|
|
|
+ splitAddress.setAddr(splitAddress.getStreet()+splitAddress.getAddr());
|
|
|
+ }
|
|
|
+ }else {
|
|
|
+ splitAddress.setAddr(splitAddress.getCommunity()+splitAddress.getAddr());
|
|
|
+ }
|
|
|
+ }
|
|
|
+ //检查多号,多弄
|
|
|
+ splitAddress.setAddr(splitAddress.getAddr().replaceAll(String.valueOf(MULTI_ADDRESS),""));
|
|
|
+
|
|
|
+ return splitAddress;
|
|
|
+ }
|
|
|
+ /**
|
|
|
+ * 工具入口,返回所有数据
|
|
|
+ * @param sourceAddress 任意形式的地址,请注意,上海市外的地址仅分词到县,上海市内分词到居委
|
|
|
+ */
|
|
|
+ public static List<SplitAddress> splitAddresses(String sourceAddress){
|
|
|
+ Matcher matcher = Pattern.compile("\\(([^()]*|\\([^()]*\\))*\\)|\\[([^\\[\\]]*|\\[[^\\[\\]]*])*]|(([^()]*|([^()]*))*)").matcher(sourceAddress);
|
|
|
+ List<SplitAddress> addressList =new ArrayList<>();
|
|
|
+ String beautyString = sourceAddress.replaceAll("\\(([^()]*|\\([^()]*\\))*\\)|\\[([^\\[\\]]*|\\[[^\\[\\]]*])*]|(([^()]*|([^()]*))*)","");
|
|
|
+ StringBuilder sb = new StringBuilder();
|
|
|
+ for (char c : beautyString.toCharArray()) {
|
|
|
+ // 检查是否为全角数字
|
|
|
+ if (c >= '0' && c <= '9') {
|
|
|
+ // 转换为半角数字
|
|
|
+ sb.append((char) (c - '0' + '0'));
|
|
|
+ } else if (c=='\uE5CE'){
|
|
|
+ // 奇妙的乱码,跳过
|
|
|
+ }else {
|
|
|
+ // 保持原字符
|
|
|
+ sb.append(c);
|
|
|
+ }
|
|
|
+ }
|
|
|
+ beautyString = sb.toString();
|
|
|
+ addressList.add(beautyResult(split(beautyString)));
|
|
|
+ while (matcher.find()){
|
|
|
+ String address=matcher.group();
|
|
|
+ if (address.length()<=2)continue;
|
|
|
+ addressList.addAll(splitAddresses(address.substring(1,address.length()-1)));
|
|
|
+ }
|
|
|
+ for (SplitAddress s :addressList)s.setSourceAddress(sourceAddress);
|
|
|
+ return addressList;
|
|
|
+ }
|
|
|
+
|
|
|
+ /**
|
|
|
+ * 工具入口,仅返回最优
|
|
|
+ * @param sourceAddress 任意形式的地址,请注意,上海市外的地址仅分词到县,上海市内分词到居委
|
|
|
+ */
|
|
|
+ public static SplitAddress splitBestAddress(String sourceAddress){
|
|
|
+ return splitAddresses(sourceAddress).stream().max(SplitAddress::compareTo).orElse(new SplitAddress());
|
|
|
+ }
|
|
|
+ public static void main(String[] args) throws Exception {
|
|
|
+// List<SplitAddress> result = new ArrayList<>();
|
|
|
+// for (Map<String,Object> row:ExcelReaderUtils.readExcel("C:\\Users\\dxh\\IdeaProjects\\address_poi_yysz_server\\src\\main\\resources\\yysk_dmdz_address_standardization_200000_36.xlsx")){
|
|
|
+// result.add(splitAddresses(row.get("address").toString()).stream().max(SplitAddress::compareTo).orElse(new SplitAddress()));
|
|
|
+// System.out.println("正在处理:"+row.get("address"));
|
|
|
+// };
|
|
|
+// ExcelReaderUtils.writeSplitAddressExcel(result,"C:\\\\Users\\\\dxh\\\\IdeaProjects\\\\address_poi_yysz_server\\\\src\\\\main\\\\resources\\\\result.xlsx");
|
|
|
+// System.out.println("完成");
|
|
|
+ new ShanghaiAddressSplitUtil().init();
|
|
|
+ System.out.println(splitBestAddress("新胜路88、98号3号厂房"));
|
|
|
+ System.out.println(splitBestAddress("新胜路88-98号3号厂房"));
|
|
|
+ System.out.println(splitBestAddress("新胜路、98号3号厂房"));
|
|
|
+ System.out.println(splitBestAddress("新胜路88\\98号3号厂房"));
|
|
|
+ System.out.println(splitBestAddress("新胜路18、28号3号厂房"));
|
|
|
+ System.out.println(splitBestAddress("新胜路28号3号厂房"));
|
|
|
+ System.out.println(splitBestAddress("88、98号3号厂房"));
|
|
|
+ }
|
|
|
+}
|