ICU 69.1
uniset.h
Go to the documentation of this file.
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5* Copyright (C) 1999-2016, International Business Machines Corporation
6* and others. All Rights Reserved.
7***************************************************************************
8* Date Name Description
9* 10/20/99 alan Creation.
10***************************************************************************
11*/
12
13#ifndef UNICODESET_H
14#define UNICODESET_H
15
16#include "unicode/utypes.h"
17
18#if U_SHOW_CPLUSPLUS_API
19
20#include "unicode/ucpmap.h"
21#include "unicode/unifilt.h"
22#include "unicode/unistr.h"
23#include "unicode/uset.h"
24
30U_NAMESPACE_BEGIN
31
32// Forward Declarations.
33class BMPSet;
34class ParsePosition;
35class RBBIRuleScanner;
36class SymbolTable;
37class UnicodeSetStringSpan;
38class UVector;
39class RuleCharacterIterator;
40
280private:
285 static constexpr int32_t INITIAL_CAPACITY = 25;
286 // fFlags constant
287 static constexpr uint8_t kIsBogus = 1; // This set is bogus (i.e. not valid)
288
289 UChar32* list = stackList; // MUST be terminated with HIGH
290 int32_t capacity = INITIAL_CAPACITY; // capacity of list
291 int32_t len = 1; // length of list used; 1 <= len <= capacity
292 uint8_t fFlags = 0; // Bit flag (see constants above)
293
294 BMPSet *bmpSet = nullptr; // The set is frozen iff either bmpSet or stringSpan is not NULL.
295 UChar32* buffer = nullptr; // internal buffer, may be NULL
296 int32_t bufferCapacity = 0; // capacity of buffer
297
307 char16_t *pat = nullptr;
308 int32_t patLen = 0;
309
310 UVector* strings = nullptr; // maintained in sorted order
311 UnicodeSetStringSpan *stringSpan = nullptr;
312
318 UChar32 stackList[INITIAL_CAPACITY];
319
320public:
330 inline UBool isBogus(void) const;
331
349
350public:
351
352 enum {
357 MIN_VALUE = 0,
358
363 MAX_VALUE = 0x10ffff
364 };
365
366 //----------------------------------------------------------------
367 // Constructors &c
368 //----------------------------------------------------------------
369
370public:
371
377
387
388#ifndef U_HIDE_INTERNAL_API
393 kSerialized /* result of serialize() */
394 };
395
408#endif /* U_HIDE_INTERNAL_API */
409
418 UnicodeSet(const UnicodeString& pattern,
420
421#ifndef U_HIDE_INTERNAL_API
434 UnicodeSet(const UnicodeString& pattern,
436 const SymbolTable* symbols,
438#endif /* U_HIDE_INTERNAL_API */
439
455 const SymbolTable* symbols,
457
463
468 virtual ~UnicodeSet();
469
476
488 virtual UBool operator==(const UnicodeSet& o) const;
489
495 inline UBool operator!=(const UnicodeSet& o) const;
496
506 virtual UnicodeSet* clone() const;
507
515 virtual int32_t hashCode(void) const;
516
525 inline static UnicodeSet *fromUSet(USet *uset);
526
535 inline static const UnicodeSet *fromUSet(const USet *uset);
536
544 inline USet *toUSet();
545
546
554 inline const USet * toUSet() const;
555
556
557 //----------------------------------------------------------------
558 // Freezable API
559 //----------------------------------------------------------------
560
569 inline UBool isFrozen() const;
570
585
595
596 //----------------------------------------------------------------
597 // Public API
598 //----------------------------------------------------------------
599
610
616 static UBool resemblesPattern(const UnicodeString& pattern,
617 int32_t pos);
618
633
634#ifndef U_HIDE_INTERNAL_API
653 const SymbolTable* symbols,
655#endif /* U_HIDE_INTERNAL_API */
656
689 ParsePosition& pos,
691 const SymbolTable* symbols,
693
708 UBool escapeUnprintable = false) const;
709
733 int32_t value,
734 UErrorCode& ec);
735
766 const UnicodeString& value,
767 UErrorCode& ec);
768
777 virtual int32_t size(void) const;
778
785 virtual UBool isEmpty(void) const;
786
794 virtual UBool contains(UChar32 c) const;
795
804 virtual UBool contains(UChar32 start, UChar32 end) const;
805
813 UBool contains(const UnicodeString& s) const;
814
822 virtual UBool containsAll(const UnicodeSet& c) const;
823
832
842
851
860
869 inline UBool containsSome(UChar32 start, UChar32 end) const;
870
878 inline UBool containsSome(const UnicodeSet& s) const;
879
887 inline UBool containsSome(const UnicodeString& s) const;
888
907 int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
908
921 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
922
940 int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
941
955 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
956
976
995
1000 virtual UMatchDegree matches(const Replaceable& text,
1001 int32_t& offset,
1002 int32_t limit,
1004
1005private:
1028 static int32_t matchRest(const Replaceable& text,
1029 int32_t start, int32_t limit,
1030 const UnicodeString& s);
1031
1041 int32_t findCodePoint(UChar32 c) const;
1042
1043public:
1044
1052 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1053
1063
1073 UChar32 charAt(int32_t index) const;
1074
1089 virtual UnicodeSet& add(UChar32 start, UChar32 end);
1090
1102
1115
1116 private:
1122 static int32_t getSingleCP(const UnicodeString& s);
1123
1124 void _add(const UnicodeString& s);
1125
1126 public:
1136
1145
1154
1163
1173
1174
1183
1195 virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1196
1197
1207
1208#ifndef U_HIDE_DRAFT_API
1220#endif // U_HIDE_DRAFT_API
1221
1235 virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1236
1248
1259
1267 virtual UnicodeSet& complement(void);
1268
1281 virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1282
1294
1305
1318 virtual UnicodeSet& addAll(const UnicodeSet& c);
1319
1331 virtual UnicodeSet& retainAll(const UnicodeSet& c);
1332
1344 virtual UnicodeSet& removeAll(const UnicodeSet& c);
1345
1357
1364 virtual UnicodeSet& clear(void);
1365
1392
1400
1408 virtual int32_t getRangeCount(void) const;
1409
1417 virtual UChar32 getRangeStart(int32_t index) const;
1418
1426 virtual UChar32 getRangeEnd(int32_t index) const;
1427
1477
1485
1498
1507 virtual UClassID getDynamicClassID(void) const;
1508
1509private:
1510
1511 // Private API for the USet API
1512
1513 friend class USetAccess;
1514
1515 const UnicodeString* getString(int32_t index) const;
1516
1517 //----------------------------------------------------------------
1518 // RuleBasedTransliterator support
1519 //----------------------------------------------------------------
1520
1521private:
1522
1528 virtual UBool matchesIndexValue(uint8_t v) const;
1529
1530private:
1531 friend class RBBIRuleScanner;
1532
1533 //----------------------------------------------------------------
1534 // Implementation: Clone as thawed (see ICU4J Freezable)
1535 //----------------------------------------------------------------
1536
1537 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1538 UnicodeSet& copyFrom(const UnicodeSet& o, UBool asThawed);
1539
1540 //----------------------------------------------------------------
1541 // Implementation: Pattern parsing
1542 //----------------------------------------------------------------
1543
1544 void applyPatternIgnoreSpace(const UnicodeString& pattern,
1545 ParsePosition& pos,
1546 const SymbolTable* symbols,
1548
1549 void applyPattern(RuleCharacterIterator& chars,
1550 const SymbolTable* symbols,
1554 int32_t depth,
1555 UErrorCode& ec);
1556
1557 //----------------------------------------------------------------
1558 // Implementation: Utility methods
1559 //----------------------------------------------------------------
1560
1561 static int32_t nextCapacity(int32_t minCapacity);
1562
1563 bool ensureCapacity(int32_t newLen);
1564
1565 bool ensureBufferCapacity(int32_t newLen);
1566
1567 void swapBuffers(void);
1568
1569 UBool allocateStrings(UErrorCode &status);
1570 UBool hasStrings() const;
1571 int32_t stringsSize() const;
1572 UBool stringsContains(const UnicodeString &s) const;
1573
1574 UnicodeString& _toPattern(UnicodeString& result,
1575 UBool escapeUnprintable) const;
1576
1577 UnicodeString& _generatePattern(UnicodeString& result,
1578 UBool escapeUnprintable) const;
1579
1580 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1581
1582 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1583
1584 //----------------------------------------------------------------
1585 // Implementation: Fundamental operators
1586 //----------------------------------------------------------------
1587
1588 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1589
1590 void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1591
1592 void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1593
1599 static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1600 int32_t pos);
1601
1602 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1604
1644 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1646 UErrorCode &ec);
1647
1648 void applyPropertyPattern(RuleCharacterIterator& chars,
1650 UErrorCode& ec);
1651
1652 static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1653
1658 typedef UBool (*Filter)(UChar32 codePoint, void* context);
1659
1669 void applyFilter(Filter filter,
1670 void* context,
1671 const UnicodeSet* inclusions,
1673
1674 // UCPMap is now stable ICU 63
1675 void applyIntPropertyValue(const UCPMap *map,
1676 UCPMapValueFilter *filter, const void *context,
1677 UErrorCode &errorCode);
1678
1682 void setPattern(const UnicodeString& newPat) {
1683 setPattern(newPat.getBuffer(), newPat.length());
1684 }
1685 void setPattern(const char16_t *newPat, int32_t newPatLen);
1689 void releasePattern();
1690
1691 friend class UnicodeSetIterator;
1692};
1693
1694
1695
1696inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1697 return !operator==(o);
1698}
1699
1700inline UBool UnicodeSet::isFrozen() const {
1701 return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1702}
1703
1704inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1705 return !containsNone(start, end);
1706}
1707
1708inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1709 return !containsNone(s);
1710}
1711
1712inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1713 return !containsNone(s);
1714}
1715
1716inline UBool UnicodeSet::isBogus() const {
1717 return (UBool)(fFlags & kIsBogus);
1718}
1719
1720inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1721 return reinterpret_cast<UnicodeSet *>(uset);
1722}
1723
1724inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1725 return reinterpret_cast<const UnicodeSet *>(uset);
1726}
1727
1728inline USet *UnicodeSet::toUSet() {
1729 return reinterpret_cast<USet *>(this);
1730}
1731
1732inline const USet *UnicodeSet::toUSet() const {
1733 return reinterpret_cast<const USet *>(this);
1734}
1735
1736inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1738 if(start<0) {
1739 start=0;
1740 } else if(start>sLength) {
1741 start=sLength;
1742 }
1743 return start+span(s.getBuffer()+start, sLength-start, spanCondition);
1744}
1745
1746inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1748 if(limit<0) {
1749 limit=0;
1750 } else if(limit>sLength) {
1751 limit=sLength;
1752 }
1753 return spanBack(s.getBuffer(), limit, spanCondition);
1754}
1755
1757
1758#endif /* U_SHOW_CPLUSPLUS_API */
1759
1760#endif
#define INITIAL_CAPACITY
The initial size of an array if it is unspecified.
Definition RunArrays.h:32
"Smart pointer" base class; do not use directly: use LocalPointer etc.
ParsePosition is a simple class used by Format and its subclasses to keep track of the current position during parsing.
Definition parsepos.h:52
Replaceable is an abstract base class representing a string of characters that supports the replacement of a range of itself with a new string of characters.
Definition rep.h:77
An interface that defines both lookup protocol and parsing of symbolic names.
Definition symtable.h:59
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Unicode characters.
Definition unifilt.h:65
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns true if this matcher will match a character c, where c & 0xFF == v, at offset,...
UnicodeSetIterator iterates over the contents of a UnicodeSet.
Definition usetiter.h:66
A mutable set of Unicode characters and multicharacter strings.
Definition uniset.h:279
UnicodeSet & operator=(const UnicodeSet &o)
Assigns this object to be a copy of another.
UnicodeSet & addAll(const UnicodeString &s)
Adds each of the characters in this string to the set.
virtual UChar32 getRangeEnd(int32_t index) const
Iteration method that returns the last character in the specified range of this set.
UnicodeSet()
Constructs an empty set.
int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UnicodeSet(UChar32 start, UChar32 end)
Constructs a set containing the given range.
static UnicodeSet * createFromAll(const UnicodeString &s)
Makes a set from each of the characters in the string.
int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
UnicodeSet & complementAll(const UnicodeString &s)
Complement EACH of the characters in this string.
void setToBogus()
Make this UnicodeSet object invalid.
UnicodeSet & applyPropertyAlias(const UnicodeString &prop, const UnicodeString &value, UErrorCode &ec)
Modifies this set to contain those code points which have the given value for the given property.
virtual UnicodeSet & removeAll(const UnicodeSet &c)
Removes from this set all of its elements that are contained in the specified set.
virtual UnicodeSet & remove(UChar32 start, UChar32 end)
Removes the specified range from this set if it is present.
UnicodeSet & applyPattern(const UnicodeString &pattern, UErrorCode &status)
Modifies this set to represent the set specified by the given pattern, ignoring Unicode Pattern_White...
virtual UnicodeSet & complement(void)
Inverts this set.
virtual UChar32 getRangeStart(int32_t index) const
Iteration method that returns the first character in the specified range of this set.
UnicodeSet & add(const UnicodeString &s)
Adds the specified multicharacter to this set if it is not already present.
UnicodeSet & applyPattern(const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Modifies this set to represent the set specified by the given pattern, optionally ignoring Unicode Pa...
virtual UnicodeSet & clear(void)
Removes all of the elements from this set.
UnicodeSet * cloneAsThawed() const
Clone the set and make the clone mutable.
int32_t indexOf(UChar32 c) const
Returns the index of the given character within this set, where the set is ordered by ascending code ...
UnicodeSet & closeOver(int32_t attribute)
Close this set over the given attribute.
UnicodeSet(const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Constructs a set from the given pattern.
virtual int32_t hashCode(void) const
Returns the hash code value for this set.
virtual UnicodeSet & complementAll(const UnicodeSet &c)
Complements in this set all elements contained in the specified set.
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental)
Implement UnicodeMatcher::matches()
UnicodeSet & complement(const UnicodeString &s)
Complement the specified string in this set.
virtual UBool containsAll(const UnicodeSet &c) const
Returns true if this set contains all the characters and strings of the given set.
UBool containsNone(const UnicodeString &s) const
Returns true if this set contains none of the characters of the given string.
UnicodeSet(const UnicodeSet &o)
Constructs a set that is identical to the given UnicodeSet.
static UBool resemblesPattern(const UnicodeString &pattern, int32_t pos)
Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet patt...
virtual UnicodeSet * clone() const
Returns a copy of this object.
virtual ~UnicodeSet()
Destructs the set.
UnicodeSet & retain(UChar32 c)
Retain the specified character from this set if it is present.
virtual UnicodeSet & compact()
Reallocate this objects internal structures to take up the least possible space, without changing thi...
int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
UnicodeSet & retainAll(const UnicodeString &s)
Retains EACH of the characters in this string.
virtual UnicodeSet & removeAllStrings()
Remove all strings from this set.
UChar32 charAt(int32_t index) const
Returns the character at the given index within this set, where the set is ordered by ascending code ...
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const
Implementation of UnicodeMatcher API.
virtual UClassID getDynamicClassID(void) const
Implement UnicodeFunctor API.
virtual int32_t getRangeCount(void) const
Iteration method that returns the number of ranges contained in this set.
UBool containsNone(UChar32 start, UChar32 end) const
Returns true if this set contains none of the characters of the given range.
virtual UBool contains(UChar32 start, UChar32 end) const
Returns true if this set contains every character of the given range.
UBool containsAll(const UnicodeString &s) const
Returns true if this set contains all the characters of the given string.
UnicodeSet & removeAll(const UnicodeString &s)
Remove EACH of the characters in this string.
virtual int32_t size(void) const
Returns the number of elements in this set (its cardinality).
int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode &ec) const
Serializes this set into an array of 16-bit integers.
UnicodeSet & set(UChar32 start, UChar32 end)
Make this object represent the range start - end.
UBool contains(const UnicodeString &s) const
Returns true if this set contains the given multicharacter string.
virtual UBool contains(UChar32 c) const
Returns true if this set contains the given character.
virtual UnicodeSet & complement(UChar32 start, UChar32 end)
Complements the specified range in this set.
static UnicodeSet * createFrom(const UnicodeString &s)
Makes a set from a multicharacter string.
static UClassID getStaticClassID(void)
Return the class ID for this class.
virtual UBool isEmpty(void) const
Returns true if this set contains no elements.
UnicodeSet(const UnicodeString &pattern, UErrorCode &status)
Constructs a set from the given pattern.
UnicodeSet & remove(UChar32 c)
Removes the specified character from this set if it is present.
UnicodeSet & add(UChar32 c)
Adds the specified character to this set if it is not already present.
UnicodeSet & applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode &ec)
Modifies this set to contain those code points which have the given value for the given binary or enu...
int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UnicodeSet & remove(const UnicodeString &s)
Removes the specified string from this set if it is present.
virtual UnicodeSet & add(UChar32 start, UChar32 end)
Adds the specified range to this set if it is not already present.
virtual UBool operator==(const UnicodeSet &o) const
Compares the specified object with this set for equality.
UnicodeSet * freeze()
Freeze the set (make it immutable).
UnicodeSet & applyPattern(const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Parses the given pattern, starting at the given position.
UnicodeSet & retain(const UnicodeString &s)
Retains only the specified string from this set if it is present.
UnicodeSet(const uint16_t buffer[], int32_t bufferLen, ESerialization serialization, UErrorCode &status)
Constructs a set from the output of serialize().
virtual UnicodeSet & retain(UChar32 start, UChar32 end)
Retain only the elements in this set that are contained in the specified range.
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=false) const
Returns a string representation of this set.
virtual UnicodeSet & addAll(const UnicodeSet &c)
Adds all of the elements in the specified set to this set if they're not already present.
UnicodeSet(const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Constructs a set from the given pattern.
virtual UnicodeSet & retainAll(const UnicodeSet &c)
Retains only the elements in this set that are contained in the specified set.
UnicodeSet & complement(UChar32 c)
Complements the specified character in this set.
UBool containsNone(const UnicodeSet &c) const
Returns true if this set contains none of the characters and strings of the given set.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition unistr.h:296
int32_t length(void) const
Return the length of the UnicodeString object.
Definition unistr.h:3890
char16_t * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition unimatch.h:33
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
UProperty
Selection constants for Unicode properties.
Definition uchar.h:195
This file defines an abstract map from Unicode code points to integer values.
uint32_t UCPMapValueFilter(const void *context, uint32_t value)
Callback function type: Modifies a map value.
Definition ucpmap.h:114
struct UCPMap UCPMap
Abstract map from Unicode code points (U+0000..U+10FFFF) to integer values.
Definition ucpmap.h:31
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition umachine.h:467
int8_t UBool
The ICU boolean type, a signed-byte integer.
Definition umachine.h:269
#define U_FINAL
Defined to the C++11 "final" keyword if available.
Definition umachine.h:141
C++ API: Unicode Filter.
C++ API: Unicode String.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition uobject.h:96
C API: Unicode Set.
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition uset.h:159
struct USet USet
USet is the C API type corresponding to C++ class UnicodeSet.
Definition uset.h:50
Basic definitions for ICU, for both C and C++ APIs.
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition utypes.h:188
UErrorCode
Standard ICU4C error code type, a substitute for exceptions.
Definition utypes.h:415
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition utypes.h:300