如何在日语 utf-8 汉字上使用 java 子串
how to use java substring on Japanese utf-8 kanji
是否可以使用子字符串从字符串中提取单个 utf8 汉字?问题是 utf-8 "characters" 的长度可以是 1、2 或 3。
例如,"𨦇𨦈𥻘"(三个增补平面汉字,U+28987、U+28988、U+25ED8)的长度是 6,所以 String.substring(1, 2) 没有得到第一个完整的字符。
在 PERL 中,我可以只使用 substr("𨦇𨦈𥻘", 1, 1) 获取第一个字符,或使用 substr("𨦇𨦈𥻘", 2, 1) 获取第二个字符。
更新:
根据@msandiford 的建议,我想到了这个。
/**
 * Splits a string into one substring per Unicode code point.
 *
 * <p>Java strings are sequences of UTF-16 chars, so a character outside the Basic
 * Multilingual Plane occupies two chars (a surrogate pair). Splitting by code point
 * keeps each such pair together in a single element.
 */
public class SplitKanji {
    /** One element per code point of the input, in input order. */
    private final String[] splitKanji;

    /**
     * Splits {@code string} into its code points.
     *
     * @param string the text to split; must not be {@code null}
     * @throws NullPointerException if {@code string} is {@code null}
     */
    public SplitKanji(String string) {
        java.util.Objects.requireNonNull(string, "string");
        int cpCount = string.codePointCount(0, string.length());
        splitKanji = new String[cpCount];
        int nextSlot = 0;
        for (int i = 0; i < string.length();) {
            // Index of the char that starts the next code point (i + 1 or i + 2).
            int ii = string.offsetByCodePoints(i, 1);
            splitKanji[nextSlot++] = string.substring(i, ii);
            i = ii;
        }
    }

    /**
     * Returns the code points of the input as individual strings.
     *
     * @return a fresh copy of the split result, so callers cannot alter this object's state
     */
    public String[] get() {
        return splitKanji.clone();
    }

    /** Demo: prints the sample text followed by each character and its length in chars. */
    public static void main(String[] args) {
        String startKanji = "私今日";
        SplitKanji myStuff = new SplitKanji(startKanji);
        String[] split = myStuff.get();
        System.out.print(startKanji + "=");
        for (String kanji : split) {
            System.out.print(kanji + ":" + kanji.length() + ", ");
        }
        System.out.println();
    }
}
您可以像这样从 String 中提取单个 Unicode 代码点:
// Three supplementary-plane CJK ideographs (U+28987, U+28988, U+25ED8), written as
// surrogate-pair escapes so the literal survives any source-file encoding. Each one
// occupies two UTF-16 chars, so length() is 6 while the code point count is 3.
public static final String KANJI = "\uD862\uDD87\uD862\uDD88\uD857\uDED8";
/**
 * Demo: prints the char length and code point count of KANJI, then every code point
 * value, then the third code point both as a number and converted back to text.
 */
public static void main(String[] args)
{
    final int charLength = KANJI.length();
    System.out.println(charLength); // 6
    System.out.println(KANJI.codePointCount(0, charLength));// 3

    // Walk the string one code point at a time, stepping over both halves of a pair.
    int pos = 0;
    while (pos < KANJI.length())
    {
        final int codePoint = KANJI.codePointAt(pos);
        System.out.println(codePoint);
        pos += Character.charCount(codePoint);
    }

    // The third code point starts two code points past the beginning.
    final int third = KANJI.codePointAt(KANJI.offsetByCodePoints(0, 2));
    System.out.println(third);

    // Turn the numeric code point back into a printable string.
    final char[] thirdAsChars = Character.toChars(third);
    System.out.println(String.valueOf(thirdAsChars));
}
您可以使用上述技巧获取您需要的代码点的开始和结束索引,然后使用 substring(start, end) 提取。
(edit) 所有这些都可以通过一些明智的重构和实用函数来简化。下面是一个可能的例子;我不知道您的代码的用例是什么,所以很难知道什么最适合您。
// Three supplementary-plane CJK ideographs (U+28987, U+28988, U+25ED8), written as
// surrogate-pair escapes so the literal survives any source-file encoding. Each one
// occupies two UTF-16 chars, so length() is 6 while the code point count is 3.
public static final String KANJI = "\uD862\uDD87\uD862\uDD88\uD857\uDED8";
/**
 * Counts the Unicode code points in a string.
 *
 * @param s the text to measure; must not be {@code null}
 * @return the number of code points in {@code s}, with a surrogate pair counted once
 */
public static int lengthCodepoints(String s)
{
    final int charLength = s.length();
    return Character.codePointCount(s, 0, charLength);
}
/**
 * Extracts a substring whose position and length are given in code points
 * rather than in UTF-16 chars.
 *
 * @param s               the source text; must not be {@code null}
 * @param startCodepoint  zero-based index, in code points, of the first one to include
 * @param numCodepoints   how many code points to include
 * @return the requested run of code points as a string
 * @throws IndexOutOfBoundsException if the requested range lies outside {@code s}
 */
public static String substringCodepoint(String s, int startCodepoint, int numCodepoints)
{
    // Translate the code point position into a char index, then measure forward from it.
    final int from = s.offsetByCodePoints(0, startCodepoint);
    final int to = s.offsetByCodePoints(from, numCodepoints);
    return s.substring(from, to);
}
/** Demo: prints every code point of KANJI on its own line. */
public static void main(String[] args)
{
    final int total = lengthCodepoints(KANJI);
    int index = 0;
    while (index < total)
    {
        final String single = substringCodepoint(KANJI, index, 1);
        System.out.println(single);
        index++;
    }
}
是否可以使用子字符串从字符串中提取单个 utf8 汉字?问题是 utf-8 "characters" 的长度可以是 1、2 或 3。
例如,"𨦇𨦈𥻘"(三个增补平面汉字,U+28987、U+28988、U+25ED8)的长度是 6,所以 String.substring(1, 2) 没有得到第一个完整的字符。
在 PERL 中,我可以只使用 substr("𨦇𨦈𥻘", 1, 1) 获取第一个字符,或使用 substr("𨦇𨦈𥻘", 2, 1) 获取第二个字符。
更新: 根据@msandiford 的建议,我想到了这个。
/**
 * Splits a string into one substring per Unicode code point.
 *
 * <p>Java strings are sequences of UTF-16 chars, so a character outside the Basic
 * Multilingual Plane occupies two chars (a surrogate pair). Splitting by code point
 * keeps each such pair together in a single element.
 */
public class SplitKanji {
    /** One element per code point of the input, in input order. */
    private final String[] splitKanji;

    /**
     * Splits {@code string} into its code points.
     *
     * @param string the text to split; must not be {@code null}
     * @throws NullPointerException if {@code string} is {@code null}
     */
    public SplitKanji(String string) {
        java.util.Objects.requireNonNull(string, "string");
        int cpCount = string.codePointCount(0, string.length());
        splitKanji = new String[cpCount];
        int nextSlot = 0;
        for (int i = 0; i < string.length();) {
            // Index of the char that starts the next code point (i + 1 or i + 2).
            int ii = string.offsetByCodePoints(i, 1);
            splitKanji[nextSlot++] = string.substring(i, ii);
            i = ii;
        }
    }

    /**
     * Returns the code points of the input as individual strings.
     *
     * @return a fresh copy of the split result, so callers cannot alter this object's state
     */
    public String[] get() {
        return splitKanji.clone();
    }

    /** Demo: prints the sample text followed by each character and its length in chars. */
    public static void main(String[] args) {
        String startKanji = "私今日";
        SplitKanji myStuff = new SplitKanji(startKanji);
        String[] split = myStuff.get();
        System.out.print(startKanji + "=");
        for (String kanji : split) {
            System.out.print(kanji + ":" + kanji.length() + ", ");
        }
        System.out.println();
    }
}
您可以像这样从 String 中提取单个 Unicode 代码点:
// Three supplementary-plane CJK ideographs (U+28987, U+28988, U+25ED8), written as
// surrogate-pair escapes so the literal survives any source-file encoding. Each one
// occupies two UTF-16 chars, so length() is 6 while the code point count is 3.
public static final String KANJI = "\uD862\uDD87\uD862\uDD88\uD857\uDED8";
/**
 * Demo: prints the char length and code point count of KANJI, then every code point
 * value, then the third code point both as a number and converted back to text.
 */
public static void main(String[] args)
{
    final int charLength = KANJI.length();
    System.out.println(charLength); // 6
    System.out.println(KANJI.codePointCount(0, charLength));// 3

    // Walk the string one code point at a time, stepping over both halves of a pair.
    int pos = 0;
    while (pos < KANJI.length())
    {
        final int codePoint = KANJI.codePointAt(pos);
        System.out.println(codePoint);
        pos += Character.charCount(codePoint);
    }

    // The third code point starts two code points past the beginning.
    final int third = KANJI.codePointAt(KANJI.offsetByCodePoints(0, 2));
    System.out.println(third);

    // Turn the numeric code point back into a printable string.
    final char[] thirdAsChars = Character.toChars(third);
    System.out.println(String.valueOf(thirdAsChars));
}
您可以使用上述技巧获取您需要的代码点的开始和结束索引,然后使用 substring(start, end) 提取。
(edit) 所有这些都可以通过一些明智的重构和实用函数来简化。下面是一个可能的例子;我不知道您的代码的用例是什么,所以很难知道什么最适合您。
// Three supplementary-plane CJK ideographs (U+28987, U+28988, U+25ED8), written as
// surrogate-pair escapes so the literal survives any source-file encoding. Each one
// occupies two UTF-16 chars, so length() is 6 while the code point count is 3.
public static final String KANJI = "\uD862\uDD87\uD862\uDD88\uD857\uDED8";
/**
 * Counts the Unicode code points in a string.
 *
 * @param s the text to measure; must not be {@code null}
 * @return the number of code points in {@code s}, with a surrogate pair counted once
 */
public static int lengthCodepoints(String s)
{
    final int charLength = s.length();
    return Character.codePointCount(s, 0, charLength);
}
/**
 * Extracts a substring whose position and length are given in code points
 * rather than in UTF-16 chars.
 *
 * @param s               the source text; must not be {@code null}
 * @param startCodepoint  zero-based index, in code points, of the first one to include
 * @param numCodepoints   how many code points to include
 * @return the requested run of code points as a string
 * @throws IndexOutOfBoundsException if the requested range lies outside {@code s}
 */
public static String substringCodepoint(String s, int startCodepoint, int numCodepoints)
{
    // Translate the code point position into a char index, then measure forward from it.
    final int from = s.offsetByCodePoints(0, startCodepoint);
    final int to = s.offsetByCodePoints(from, numCodepoints);
    return s.substring(from, to);
}
/** Demo: prints every code point of KANJI on its own line. */
public static void main(String[] args)
{
    final int total = lengthCodepoints(KANJI);
    int index = 0;
    while (index < total)
    {
        final String single = substringCodepoint(KANJI, index, 1);
        System.out.println(single);
        index++;
    }
}