ICU 69.1
normalizer2.h
Go to the documentation of this file.
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5*
6* Copyright (C) 2009-2013, International Business Machines
7* Corporation and others. All Rights Reserved.
8*
9*******************************************************************************
10* file name: normalizer2.h
11* encoding: UTF-8
12* tab size: 8 (not used)
13* indentation:4
14*
15* created on: 2009nov22
16* created by: Markus W. Scherer
17*/
18
19#ifndef __NORMALIZER2_H__
20#define __NORMALIZER2_H__
21
27#include "unicode/utypes.h"
28
29#if U_SHOW_CPLUSPLUS_API
30
31#if !UCONFIG_NO_NORMALIZATION
32
33#include "unicode/stringpiece.h"
34#include "unicode/uniset.h"
35#include "unicode/unistr.h"
36#include "unicode/unorm2.h"
37
38U_NAMESPACE_BEGIN
39
40class ByteSink;
41
86public:
92
104 static const Normalizer2 *
106
118 static const Normalizer2 *
120
132 static const Normalizer2 *
134
146 static const Normalizer2 *
148
160 static const Normalizer2 *
162
184 static const Normalizer2 *
186 const char *name,
188 UErrorCode &errorCode);
189
201 normalize(const UnicodeString &src, UErrorCode &errorCode) const {
203 normalize(src, result, errorCode);
204 return result;
205 }
219 virtual UnicodeString &
222 UErrorCode &errorCode) const = 0;
223
246 virtual void
248 Edits *edits, UErrorCode &errorCode) const;
249
264 virtual UnicodeString &
266 const UnicodeString &second,
267 UErrorCode &errorCode) const = 0;
282 virtual UnicodeString &
284 const UnicodeString &second,
285 UErrorCode &errorCode) const = 0;
286
300 virtual UBool
302
327 virtual UBool
329
345 virtual UChar32
347
356 virtual uint8_t
358
373 virtual UBool
374 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const = 0;
394 virtual UBool
396
397
414 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const = 0;
415
438 virtual int32_t
439 spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const = 0;
440
454 virtual UBool hasBoundaryBefore(UChar32 c) const = 0;
455
470 virtual UBool hasBoundaryAfter(UChar32 c) const = 0;
471
485 virtual UBool isInert(UChar32 c) const = 0;
486};
487
500public:
512 norm2(n2), set(filterSet) {}
513
519
533 virtual UnicodeString &
536 UErrorCode &errorCode) const U_OVERRIDE;
537
560 virtual void
562 Edits *edits, UErrorCode &errorCode) const U_OVERRIDE;
563
578 virtual UnicodeString &
580 const UnicodeString &second,
581 UErrorCode &errorCode) const U_OVERRIDE;
596 virtual UnicodeString &
598 const UnicodeString &second,
599 UErrorCode &errorCode) const U_OVERRIDE;
600
612 virtual UBool
614
626 virtual UBool
628
639 virtual UChar32
641
650 virtual uint8_t
652
664 virtual UBool
665 isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
685 virtual UBool
699 quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE;
711 virtual int32_t
713
723
733
741 virtual UBool isInert(UChar32 c) const U_OVERRIDE;
742private:
744 normalize(const UnicodeString &src,
747 UErrorCode &errorCode) const;
748
749 void
750 normalizeUTF8(uint32_t options, const char *src, int32_t length,
753 UErrorCode &errorCode) const;
754
756 normalizeSecondAndAppend(UnicodeString &first,
757 const UnicodeString &second,
759 UErrorCode &errorCode) const;
760
761 const Normalizer2 &norm2;
762 const UnicodeSet &set;
763};
764
766
767#endif // !UCONFIG_NO_NORMALIZATION
768
769#endif /* U_SHOW_CPLUSPLUS_API */
770
771#endif // __NORMALIZER2_H__
A ByteSink can be filled with bytes.
Definition bytestream.h:53
Records lengths of string edits but not replacement text.
Definition edits.h:80
Normalization filtered by a UnicodeSet.
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const U_OVERRIDE
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
virtual UBool isInert(UChar32 c) const U_OVERRIDE
Tests if the character is normalization-inert.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE
Gets the raw decomposition mapping of c.
~FilteredNormalizer2()
Destructor.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const U_OVERRIDE
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const U_OVERRIDE
Tests if the UTF-8 string is normalized.
virtual UBool hasBoundaryBefore(UChar32 c) const U_OVERRIDE
Tests if the character always has a normalization boundary before it, regardless of context.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const U_OVERRIDE
Gets the decomposition mapping of c.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE
Returns the end of the normalized substring of the input string.
virtual uint8_t getCombiningClass(UChar32 c) const U_OVERRIDE
Gets the combining class of c.
virtual UChar32 composePair(UChar32 a, UChar32 b) const U_OVERRIDE
Performs pairwise composition of a & b and returns the composite if there is one.
virtual UBool hasBoundaryAfter(UChar32 c) const U_OVERRIDE
Tests if the character always has a normalization boundary after it, regardless of context.
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE
Tests if the string is normalized.
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const U_OVERRIDE
Tests if the string is normalized.
FilteredNormalizer2(const Normalizer2 &n2, const UnicodeSet &filterSet)
Constructs a filtered normalizer wrapping any Normalizer2 instance and a filter set.
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const U_OVERRIDE
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const U_OVERRIDE
Appends the second string to the first string (merging them at the boundary) and returns the first string.
"Smart pointer" base class; do not use directly: use LocalPointer etc.
Unicode normalization functionality for standard Unicode normalization or for using custom mapping tables.
Definition normalizer2.h:85
static const Normalizer2 * getNFDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFD normalization.
virtual UBool getDecomposition(UChar32 c, UnicodeString &decomposition) const =0
Gets the decomposition mapping of c.
virtual int32_t spanQuickCheckYes(const UnicodeString &s, UErrorCode &errorCode) const =0
Returns the end of the normalized substring of the input string.
virtual UBool isNormalized(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
virtual UBool hasBoundaryBefore(UChar32 c) const =0
Tests if the character always has a normalization boundary before it, regardless of context.
~Normalizer2()
Destructor.
virtual UChar32 composePair(UChar32 a, UChar32 b) const
Performs pairwise composition of a & b and returns the composite if there is one.
static const Normalizer2 * getInstance(const char *packageName, const char *name, UNormalization2Mode mode, UErrorCode &errorCode)
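Returns a Normalizer2 instance which uses the specified data file (packageName/name similar to ucnv_openPackage() and ures_open()/ResourceBundle) and which composes or decomposes text according to the specified mode.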
virtual UnicodeString & append(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the second string to the first string (merging them at the boundary) and returns the first string.
static const Normalizer2 * getNFKDInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKD normalization.
virtual UnicodeString & normalize(const UnicodeString &src, UnicodeString &dest, UErrorCode &errorCode) const =0
Writes the normalized form of the source string to the destination string (replacing its contents) and returns the destination string.
static const Normalizer2 * getNFKCCasefoldInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKC_Casefold normalization.
UnicodeString normalize(const UnicodeString &src, UErrorCode &errorCode) const
Returns the normalized form of the source string.
virtual UNormalizationCheckResult quickCheck(const UnicodeString &s, UErrorCode &errorCode) const =0
Tests if the string is normalized.
static const Normalizer2 * getNFCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFC normalization.
virtual UBool hasBoundaryAfter(UChar32 c) const =0
Tests if the character always has a normalization boundary after it, regardless of context.
virtual uint8_t getCombiningClass(UChar32 c) const
Gets the combining class of c.
virtual UBool isInert(UChar32 c) const =0
Tests if the character is normalization-inert.
static const Normalizer2 * getNFKCInstance(UErrorCode &errorCode)
Returns a Normalizer2 instance for Unicode NFKC normalization.
virtual void normalizeUTF8(uint32_t options, StringPiece src, ByteSink &sink, Edits *edits, UErrorCode &errorCode) const
Normalizes a UTF-8 string and optionally records how source substrings relate to changed and unchanged result substrings.
virtual UBool isNormalizedUTF8(StringPiece s, UErrorCode &errorCode) const
Tests if the UTF-8 string is normalized.
virtual UBool getRawDecomposition(UChar32 c, UnicodeString &decomposition) const
Gets the raw decomposition mapping of c.
virtual UnicodeString & normalizeSecondAndAppend(UnicodeString &first, const UnicodeString &second, UErrorCode &errorCode) const =0
Appends the normalized form of the second string to the first string (merging them at the boundary) and returns the first string.
A string-like object that points to a sized piece of memory.
Definition stringpiece.h:60
UObject is the common ICU "boilerplate" class.
Definition uobject.h:223
A mutable set of Unicode characters and multicharacter strings.
Definition uniset.h:279
UnicodeString is a string class that stores Unicode characters directly and provides similar functionality as the Java String and StringBuffer/StringBuilder classes.
Definition unistr.h:296
C++ API: StringPiece: Read-only byte string wrapper class.
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition umachine.h:467
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition umachine.h:269
#define U_OVERRIDE
Defined to the C++11 "override" keyword if available.
Definition umachine.h:130
C++ API: Unicode Set.
C++ API: Unicode String.
C API: New API for Unicode Normalization.
UNormalizationCheckResult
Result values for normalization quick check functions.
Definition unorm2.h:97
UNormalization2Mode
Constants for normalization modes.
Definition unorm2.h:48
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is contained or not contained in the set.
Definition uset.h:159
Basic definitions for ICU, for both C and C++ APIs.
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition utypes.h:415
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition utypes.h:300