java icu4j_java – 如何使用ICU4J库

空成天

2023-12-01

嗨Adham我在ICU4J的经验很少我试图读取LTR阿拉伯语文本并将其转换为RTL文本

我将数字从英文更改为阿拉伯数字,并将对齐设置为RTL这是一个简单的代码,我希望我的小经验可以帮助你这是ICU4J网站中的

demos

PdfReader reader = new PdfReader(INPUTFILE);

String txt=PdfTextExtractor.getTextFromPage(reader, 1);

BiDiClass bidiClass = new BiDiClass();

String arabicNumber = bidiClass.englishToArabicNumber(txt);

String out=bidiClass.makeLineLogicalOrder(arabicNumber, true);

System.out.println(out);

这就是BiDiClass

import com.ibm.icu.text.Bidi;

import com.ibm.icu.text.Normalizer;

//Editor : Ibraheem Osama Mohamed

/**

* This class is an implementation the the ICU4J class. TextNormalize

* will call this only if the ICU4J library exists in the classpath.

* @author Brian Carrier

* @version $Revision: 1.0 $

public class BiDiClass {

private static final String REPLACE_CHARS = "0123456789.";

private Bidi bidi;

private StringBuilder sb = new StringBuilder();

/**

* Constructor.

public BiDiClass()

{

bidi = new Bidi();

/* We do not use bidi.setInverse() because that uses

* Bidi.REORDER_INVERSE_NUMBERS_AS_L, which caused problems

* in some test files. For example, a file had a line of:

* 0 1 / ARABIC

* and the 0 and 1 were reversed in the end result.

* REORDER_INVERSE_LIKE_DIRECT is the inverse Bidi mode

* that more closely reflects the Unicode spec.

bidi.setReorderingMode(Bidi.REORDER_INVERSE_LIKE_DIRECT);

}

/**

* Takes a line of text in presentation order and converts it to logical order.

* @see TextNormalize.makeLineLogicalOrder(String, boolean)

* @param str String to convert

* @param isRtlDominant RTL (right-to-left) will be the dominant text direction

* @return The converted string

public String makeLineLogicalOrder(String str, boolean isRtlDominant)

{

bidi.setPara(str, isRtlDominant?Bidi.RTL:Bidi.LTR, null);

/* Set the mirror flag so that parentheses and other mirror symbols

* are properly reversed, when needed. With this removed, lines

* such as (CBA) in the PDF file will come out like )ABC( in logical

* order.

return bidi.writeReordered(Bidi.DO_MIRRORING);

}

//algorithm to change form English number to Arabic number

public String englishToArabicNumber(String string){

char[] ch=string.toCharArray();

for (char c : ch) {

if (REPLACE_CHARS.contains(String.valueOf(c))) {

c = (char) ('\u0660' - '0' + c);

}

sb.append(c);

}

return sb.toString();

}

/**

* Normalize presentation forms of characters to the separate parts.

* @see TextNormalize.normalizePres(String)

* @param str String to normalize

* @return Normalized form

public String normalizePres(String str)

{

StringBuilder builder = null;

int p = 0;

int q = 0;

int strLength = str.length();

for (; q < strLength; q++) /* >>>*/

{

// We only normalize if the codepoint is in a given range.

// Otherwise, NFKC converts too many things that would cause

// confusion. For example, it converts the micro symbol in

// extended Latin to the value in the Greek script. We normalize

// the Unicode Alphabetic and Arabic A&B Presentation forms.

char c = str.charAt(q);

if ((0xFB00 <= c && c <= 0xFDFF) || (0xFE70 <= c && c <= 0xFEFF))/* >>>*/

{

if (builder == null) {

builder = new StringBuilder(strLength * 2);

}

builder.append(str.substring(p, q));

// Some fonts map U+FDF2 differently than the Unicode spec.

// They add an extra U+0627 character to compensate.

// This removes the extra character for those fonts.

if(c == 0xFDF2 && q > 0 && (str.charAt(q-1) == 0x0627 || str.charAt(q-1) == 0xFE8D))

{

builder.append("\u0644\u0644\u0647");

}

else

{

// Trim because some decompositions have an extra space,

// such as U+FC5E

builder.append(

Normalizer.normalize(c, Normalizer.NFKC).trim());

}

p = q + 1;

}

if (builder == null) {

return str;

} else {

builder.append(str.substring(p, q));

return builder.toString();

}

/**

* Decomposes Diacritic characters to their combining forms.

* @param str String to be Normalized

* @return A Normalized String

public String normalizeDiac(String str)

{

StringBuilder retStr = new StringBuilder();

int strLength = str.length();

for (int i = 0; i < strLength; i++) /* >>>*/

{

char c = str.charAt(i);

if(Character.getType(c) == Character.NON_SPACING_MARK

|| Character.getType(c) == Character.MODIFIER_SYMBOL

|| Character.getType(c) == Character.MODIFIER_LETTER)

{

* Trim because some decompositions have an extra space, such as

* U+00B4

retStr.append(Normalizer.normalize(c, Normalizer.NFKC).trim());

}

else

{

retStr.append(str.charAt(i));

}

return retStr.toString();

}

java icu4j_java – 如何使用ICU4J库

相关阅读

相关文章

相关问答

相关文档