2020-04-23:修复了部分情况下获取首字母时抛出异常的问题(增加了 py.length() > 0 判断)。


multi_pinyin.txt 是多音字库(pinyin4j 源码包里自带)。可以把它改名并另存到其他路径,用来扩展其中的多音词条;自带的词库并不完整,例如“重启”需要添加一行“重启 (chong2,qi3)”才能被正确识别。

import net.sourceforge.pinyin4j.format.HanyuPinyinCaseType;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;
import net.sourceforge.pinyin4j.multipinyin.Trie;
import org.springframework.core.io.ClassPathResource;
import org.springframework.util.StringUtils;

import java.io.BufferedInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;

public class PinyinHelper {

    /** Custom pinyin extension dictionary, looked up on the classpath. */
    private static final String MULTI_PINYIN_APPENDER = "multi_pinyin_appender.txt";

    /**
     * Converts a string to Hanyu Pinyin using longest-match lookup in the pinyin trie.
     *
     * <p>For each start position the trie is walked character by character, keyed by the
     * upper-case hexadecimal value of each char. The last node on the path that carries a
     * pinyin record wins, so multi-character words can override single-character readings.
     *
     * <p>NOTE(review): this listing has lost lines (closing braces, the bodies of several
     * {@code if}/{@code else} branches, and whatever advances {@code current}). Restore them
     * from the original source before compiling; the comments below describe only what the
     * surviving lines show.
     *
     * @param str          text to convert
     * @param outputFormat format handed to {@code PinyinFormatter.formatHanyuPinyin} for every syllable
     * @param separate     separator string; the surviving conditions test whether the buffer
     *                     already ends with it
     * @param retain       controls what happens to characters without a pinyin record
     *                     (kept or dropped, per the original comment)
     * @return the accumulated pinyin text
     * @throws BadHanyuPinyinOutputFormatCombination propagated from the formatter
     */
    public static String toHanYuPinyinString(String str, HanyuPinyinOutputFormat outputFormat, String separate, boolean retain) throws BadHanyuPinyinOutputFormatCombination {
        ChineseToPinyinResource resource = ChineseToPinyinResource.getInstance();
        StringBuilder resultPinyinStrBuf = new StringBuilder();
        char[] chars = str.toCharArray();
        for (int i = 0; i < chars.length; i++) {
            // Pinyin record of the longest match found so far for the word starting at i.
            String result = null;
            char ch = chars[i];
            // Every word lookup starts again from the root table.
            Trie currentTrie = resource.getUnicodeToHanyuPinyinTable();
            // success: index of the last char covered by the recorded match.
            int success = i;
            // current: index of the char currently being looked up.
            int current = i;
            do {
                // Trie keys are the upper-case hex form of the char value.
                String hexStr = Integer.toHexString((int) ch).toUpperCase();
                currentTrie = currentTrie.get(hexStr);
                if (currentTrie != null) {
                    if (currentTrie.getPinyin() != null) {
                        // A record exists at this depth: remember it and where it ends.
                        result = currentTrie.getPinyin();
                        success = current;
                    // Descend to the child trie to try to extend the match.
                    currentTrie = currentTrie.getNextTire();
                } else {

                if (current < chars.length) {
                    ch = chars[current];
                } else {
            } while (currentTrie != null);

            // Nothing matched in the prefix tree, so the char cannot be converted to pinyin:
            // it is either emitted as-is or dropped.
            if (result == null) {
                if (retain) {
                    // True when this is not the first char, the walk went past the match,
                    // and the buffer does not already end with the separator.
                    if (i != 0 && current != success && resultPinyinStrBuf.lastIndexOf(separate) != resultPinyinStrBuf.length() - 1) {
            } else {
                // Split the "(a,b,...)" record into its individual syllables.
                String[] pinyinStrArray = resource.parsePinyinString(result);
                if (pinyinStrArray != null) {
                    for (int j = 0; j < pinyinStrArray.length; j++) {
                        if (i != 0 && current != success && resultPinyinStrBuf.lastIndexOf(separate) != resultPinyinStrBuf.length() - 1) {
                        resultPinyinStrBuf.append(PinyinFormatter.formatHanyuPinyin(pinyinStrArray[j], outputFormat));
                        // Not the last char of the input, or (not the last syllable of the
                        // record and the match spans more than one char).
                        if (current < chars.length || (j < pinyinStrArray.length - 1 && i != success)) {
                        // Single-char match (the match ends where it started).
                        if (i == success) {
            // Continue after the last char consumed by the match.
            i = success;
        return resultPinyinStrBuf.toString();

    static class PinyinFormatter {

        static String formatHanyuPinyin(String pinyinStr, HanyuPinyinOutputFormat outputFormat)
                throws BadHanyuPinyinOutputFormatCombination {
            if ((HanyuPinyinToneType.WITH_TONE_MARK == outputFormat.getToneType())
                    && ((HanyuPinyinVCharType.WITH_V == outputFormat.getVCharType()) || (HanyuPinyinVCharType.WITH_U_AND_COLON == outputFormat
                    .getVCharType()))) {
                throw new BadHanyuPinyinOutputFormatCombination("tone marks cannot be added to v or u:");

            if (HanyuPinyinToneType.WITHOUT_TONE == outputFormat.getToneType()) {
                pinyinStr = pinyinStr.replaceAll("[1-5]", "");
            } else if (HanyuPinyinToneType.WITH_TONE_MARK == outputFormat.getToneType()) {
                pinyinStr = pinyinStr.replaceAll("u:", "v");
                pinyinStr = convertToneNumber2ToneMark(pinyinStr);

            if (HanyuPinyinVCharType.WITH_V == outputFormat.getVCharType()) {
                pinyinStr = pinyinStr.replaceAll("u:", "v");
            } else if (HanyuPinyinVCharType.WITH_U_UNICODE == outputFormat.getVCharType()) {
                pinyinStr = pinyinStr.replaceAll("u:", "ü");

            if (HanyuPinyinCaseType.UPPERCASE == outputFormat.getCaseType()) {
                pinyinStr = pinyinStr.toUpperCase();
            return pinyinStr;

         * Convert tone numbers to tone marks using Unicode <br/><br/>
         * <b>Algorithm for determining location of tone mark</b><br/>
         * <p>
         * A simple algorithm for determining the vowel on which the tone mark
         * appears is as follows:<br/>
         * <ol>
         * <li>First, look for an "a" or an "e". If either vowel appears, it takes
         * the tone mark. There are no possible pinyin syllables that contain both
         * an "a" and an "e".
         * <li>If there is no "a" or "e", look for an "ou". If "ou" appears, then
         * the "o" takes the tone mark.
         * <li>If none of the above cases hold, then the last vowel in the syllable
         * takes the tone mark.
         * </ol>
         * @param pinyinStr the ascii represention with tone numbers
         * @return the unicode represention with tone marks
        private static String convertToneNumber2ToneMark(final String pinyinStr) {
            String lowerCasePinyinStr = pinyinStr.toLowerCase();

            if (lowerCasePinyinStr.matches("[a-z]*[1-5]?")) {
                final char defautlCharValue = '$';
                final int defautlIndexValue = -1;

                char unmarkedVowel = defautlCharValue;
                int indexOfUnmarkedVowel = defautlIndexValue;

                final char charA = 'a';
                final char charE = 'e';
                final String ouStr = "ou";
                final String allUnmarkedVowelStr = "aeiouv";
                final String allMarkedVowelStr = "āáăàaēéĕèeīíĭìiōóŏòoūúŭùuǖǘǚǜü";

                if (lowerCasePinyinStr.matches("[a-z]*[1-5]")) {

                    int tuneNumber =
                            Character.getNumericValue(lowerCasePinyinStr.charAt(lowerCasePinyinStr.length() - 1));

                    int indexOfA = lowerCasePinyinStr.indexOf(charA);
                    int indexOfE = lowerCasePinyinStr.indexOf(charE);
                    int ouIndex = lowerCasePinyinStr.indexOf(ouStr);

                    if (-1 != indexOfA) {
                        indexOfUnmarkedVowel = indexOfA;
                        unmarkedVowel = charA;
                    } else if (-1 != indexOfE) {
                        indexOfUnmarkedVowel = indexOfE;
                        unmarkedVowel = charE;
                    } else if (-1 != ouIndex) {
                        indexOfUnmarkedVowel = ouIndex;
                        unmarkedVowel = ouStr.charAt(0);
                    } else {
                        for (int i = lowerCasePinyinStr.length() - 1; i >= 0; i--) {
                            if (String.valueOf(lowerCasePinyinStr.charAt(i)).matches(
                                    "[" + allUnmarkedVowelStr + "]")) {
                                indexOfUnmarkedVowel = i;
                                unmarkedVowel = lowerCasePinyinStr.charAt(i);

                    if ((defautlCharValue != unmarkedVowel) && (defautlIndexValue != indexOfUnmarkedVowel)) {
                        int rowIndex = allUnmarkedVowelStr.indexOf(unmarkedVowel);
                        int columnIndex = tuneNumber - 1;

                        int vowelLocation = rowIndex * 5 + columnIndex;

                        char markedVowel = allMarkedVowelStr.charAt(vowelLocation);

                        return lowerCasePinyinStr.substring(0, indexOfUnmarkedVowel).replaceAll("v", "ü")
                                + markedVowel
                                + lowerCasePinyinStr.substring(indexOfUnmarkedVowel + 1,
                                lowerCasePinyinStr.length() - 1).replaceAll("v", "ü");

                    } else
                    // error happens in the procedure of locating vowel
                        return lowerCasePinyinStr;
                } else
                // input string has no any tune number
                    // only replace v with ü (umlat) character
                    return lowerCasePinyinStr.replaceAll("v", "ü");
            } else
            // bad format
                return lowerCasePinyinStr;


    static class ChineseToPinyinResource {

        /** Trie that maps a character's upper-case hexadecimal code to its Hanyu Pinyin record. */
        private Trie unicodeToHanyuPinyinTable = null;

         * @param unicodeToHanyuPinyinTable The unicodeToHanyuPinyinTable to set.
        private void setUnicodeToHanyuPinyinTable(Trie unicodeToHanyuPinyinTable) {
            this.unicodeToHanyuPinyinTable = unicodeToHanyuPinyinTable;

         * @return Returns the unicodeToHanyuPinyinTable.
        Trie getUnicodeToHanyuPinyinTable() {
            return unicodeToHanyuPinyinTable;

         * Private constructor as part of the singleton pattern.
        /**
         * Private constructor as part of the singleton pattern; loads the pinyin tables.
         */
        private ChineseToPinyinResource() {
            // initializeResource() is private and has no other caller; without this call the
            // lookup trie would stay null and every lookup would fail.
            initializeResource();
        }

         * Initialize a hash-table contains <Unicode, HanyuPinyin> pairs
        /**
         * Creates the lookup trie and fills it from the bundled pinyin tables and the
         * optional classpath extension dictionary.
         *
         * <p>NOTE(review): this listing has lost the statements that actually load the
         * resources, the bodies of both catch blocks, and the closing braces. Restore them
         * from the original source; the comments below describe only the surviving lines.
         */
        private void initializeResource() {
            try {
                // Resource paths of the bundled single-character and multi-character tables.
                final String resourceName = "/pinyindb/unicode_to_hanyu_pinyin.txt";
                final String resourceMultiName = "/pinyindb/multi_pinyin.txt";

                // Start from an empty trie.
                setUnicodeToHanyuPinyinTable(new Trie());


                // Added: pinyin extension dictionary looked up on the classpath.
                if (StringUtils.hasLength(MULTI_PINYIN_APPENDER)) {
                    ClassPathResource pathResource = new ClassPathResource(MULTI_PINYIN_APPENDER);
                    // The extension file is optional; it is only used when present.
                    if (pathResource.exists()) {

                // Original pinyin extension dictionary; supports absolute paths only.

            } catch (FileNotFoundException ex) {
            } catch (IOException ex) {

        Trie getHanyuPinyinTrie(char ch) {

            String codepointHexStr = Integer.toHexString((int) ch).toUpperCase();

            // fetch from hashtable
            return getUnicodeToHanyuPinyinTable().get(codepointHexStr);

         * Get the unformatted Hanyu Pinyin representations of the given Chinese
         * character in array format.
         * @param ch given Chinese character in Unicode
         * @return The Hanyu Pinyin strings of the given Chinese character in array
         * format; return null if there is no corresponding Pinyin string.
        String[] getHanyuPinyinStringArray(char ch) {
            String pinyinRecord = getHanyuPinyinRecordFromChar(ch);
            return parsePinyinString(pinyinRecord);

        String[] parsePinyinString(String pinyinRecord) {

            if (null != pinyinRecord) {
                int indexOfLeftBracket = pinyinRecord.indexOf(Field.LEFT_BRACKET);
                int indexOfRightBracket = pinyinRecord.lastIndexOf(Field.RIGHT_BRACKET);

                String stripedString =
                        pinyinRecord.substring(indexOfLeftBracket + Field.LEFT_BRACKET.length(),

                return stripedString.split(Field.COMMA);

            } else {
                // no record found or mal-formatted record
                return null;

         * @param record given record string of Hanyu Pinyin
         * @return return true if record is not null and record is not "none0" and
         * record is not mal-formatted, else return false
        private boolean isValidRecord(String record) {
            final String noneStr = "(none0)";

            return (null != record) && !record.equals(noneStr) && record.startsWith(Field.LEFT_BRACKET)
                    && record.endsWith(Field.RIGHT_BRACKET);

         * @param ch given Chinese character in Unicode
         * @return corresponding Hanyu Pinyin Record in Properties file; null if no
         * record found
        private String getHanyuPinyinRecordFromChar(char ch) {
            // convert Chinese character to code point (integer)
            // please refer to http://www.unicode.org/glossary/#code_point
            // Another reference: http://en.wikipedia.org/wiki/Unicode
            int codePointOfChar = ch;

            String codepointHexStr = Integer.toHexString(codePointOfChar).toUpperCase();

            // fetch from hashtable
            Trie trie = getUnicodeToHanyuPinyinTable().get(codepointHexStr);
            String foundRecord = null;
            if (trie != null) {
                foundRecord = trie.getPinyin();

            return isValidRecord(foundRecord) ? foundRecord : null;

         * Singleton factory method.
         * @return the one and only MySingleton.
        static ChineseToPinyinResource getInstance() {
            return ChineseToPinyinResourceHolder.THE_INSTANCE;

         * Singleton implementation helper.
        private static class ChineseToPinyinResourceHolder {
            static final ChineseToPinyinResource THE_INSTANCE = new ChineseToPinyinResource();

         * A class encloses common string constants used in Properties files
         * @author Li Min (xmlerlimin@gmail.com)
        class Field {
            static final String LEFT_BRACKET = "(";

            static final String RIGHT_BRACKET = ")";

            static final String COMMA = ",";


    static class ResourceHelper {

         * @param resourceName
         * @return resource (mainly file in file system or file in compressed
         * package) as BufferedInputStream
        static BufferedInputStream getResourceInputStream(String resourceName) {
            return new BufferedInputStream(ResourceHelper.class.getResourceAsStream(resourceName));

下面是使用示例(其中用到了 Google Guava 库的部分 API):

import com.google.common.base.Splitter;
import com.google.common.collect.Lists;
import net.sourceforge.pinyin4j.format.HanyuPinyinOutputFormat;
import net.sourceforge.pinyin4j.format.HanyuPinyinToneType;
import net.sourceforge.pinyin4j.format.HanyuPinyinVCharType;
import net.sourceforge.pinyin4j.format.exception.BadHanyuPinyinOutputFormatCombination;

import java.util.List;

 * 拼音工具类
public class PinyinUtil {

    // Output format passed to PinyinHelper.toHanYuPinyinString; created once in the
    // static initializer below.
    private static HanyuPinyinOutputFormat outputFormat;
    // Separator handed to PinyinHelper.toHanYuPinyinString and used to split its result.
    private static final String SEPARATE = "#";

    static {
        outputFormat = new HanyuPinyinOutputFormat();
        // NOTE(review): the rest of this initializer (any format settings and the closing
        // brace) is missing from this listing; restore it from the original source.
     * 获取文本的拼音
     * @param str     需要转换拼音的文本
     * @param retain  true:保留中文以外的其他字符
     * @param initial true:只需要首字母
     * @return 拼音
    /**
     * Gets the pinyin of a text.
     *
     * <p>NOTE(review): most statement bodies and closing braces of this method are missing
     * from this listing; the comments below describe only the surviving lines.
     *
     * @param str     text to convert to pinyin
     * @param retain  {@code true} to keep characters other than Chinese ones
     * @param initial {@code true} if only the first letters are wanted
     * @return the text accumulated in the result builder
     */
    public static String toPinYinString(String str, boolean retain, boolean initial) {
        StringBuilder sb = new StringBuilder();
        try {
            List<String> list = Lists.newArrayList();
            // Buffer for the current run of non-Chinese characters.
            StringBuilder notChinese = new StringBuilder();
            for (int i = 0; i < str.length(); i++) {
                // Characters outside U+4E00..U+9FA5 are treated as non-Chinese.
                if (str.charAt(i) < 0x4E00 || str.charAt(i) > 0x9FA5) {
                    // Last character of the input.
                    if (i == str.length() - 1) {
                } else {
                    // A Chinese character ends a pending non-Chinese run.
                    if (notChinese.length() > 0) {
                        notChinese = new StringBuilder();
            String pinyin = PinyinHelper.toHanYuPinyinString(str, outputFormat, SEPARATE, retain);
            // Process the converted text piece by piece.
            Splitter.on(SEPARATE).split(pinyin).forEach(py -> {
                // Piece equals one of the entries collected in list.
                if (list.contains(py)) {
                if (initial) {
                    // Guard against empty pieces (presumably before taking the first letter — confirm).
                	if (py.length() > 0) {
                } else {
        } catch (BadHanyuPinyinOutputFormatCombination e) {
        return sb.toString();


		// Usage example: the same input converted with every retain/initial combination.
		// The comment after each call is the output listed by the author.
		String str = "成长,重启,重量,长大了,角色,角落,呼啦啦,1我2,3爱4,5你6";
        // retain = true, initial = true
        System.out.println(PinyinUtil.toPinYinString(str, true, true));
        // cz,cq,zl,zdl,js,jl,hll,1w2,3a4,5n6
        // retain = false, initial = true
        System.out.println(PinyinUtil.toPinYinString(str, false, true));
        // czcqzlzdljsjlhllwan
        // retain = true, initial = false
        System.out.println(PinyinUtil.toPinYinString(str, true, false));
        // chengzhang,chongqi,zhongliang,zhangdale,juese,jiaoluo,hulala,1wo2,3ai4,5ni6
        // retain = false, initial = false
        System.out.println(PinyinUtil.toPinYinString(str, false, false));
        // chengzhangchongqizhongliangzhangdalejuesejiaoluohulalawoaini