ICU 62.1 62.1
uniset.h
Go to the documentation of this file.
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4***************************************************************************
5* Copyright (C) 1999-2016, International Business Machines Corporation
6* and others. All Rights Reserved.
7***************************************************************************
8* Date Name Description
9* 10/20/99 alan Creation.
10***************************************************************************
11*/
12
13#ifndef UNICODESET_H
14#define UNICODESET_H
15
16#include "unicode/unifilt.h"
17#include "unicode/unistr.h"
18#include "unicode/uset.h"
19
26
27// Forward Declarations.
30class BMPSet;
31class ParsePosition;
32class RBBIRuleScanner;
33class SymbolTable;
35class UVector;
37
279
280 int32_t len; // length of list used; 0 <= len <= capacity
281 int32_t capacity; // capacity of list
282 UChar32* list; // MUST be terminated with HIGH
283 BMPSet *bmpSet; // The set is frozen iff either bmpSet or stringSpan is not NULL.
284 UChar32* buffer; // internal buffer, may be NULL
285 int32_t bufferCapacity; // capacity of buffer
286 int32_t patLen;
287
297 char16_t *pat;
298 UVector* strings; // maintained in sorted order
299 UnicodeSetStringSpan *stringSpan;
300
301private:
302 enum { // constants
303 kIsBogus = 1 // This set is bogus (i.e. not valid)
304 };
305 uint8_t fFlags; // Bit flag (see constants above)
306public:
316 inline UBool isBogus(void) const;
317
335
336public:
337
// Named bounds exposed as public class constants.
// NOTE(review): presumably the smallest and largest code point a set can hold — confirm against the upstream API docs.
338 enum {
343 MIN_VALUE = 0, // lower bound: 0 (U+0000)
344
349 MAX_VALUE = 0x10ffff // upper bound: 0x10ffff (U+10FFFF)
350 };
351
352 //----------------------------------------------------------------
353 // Constructors &c
354 //----------------------------------------------------------------
355
356public:
357
363
373
374#ifndef U_HIDE_INTERNAL_API
379 kSerialized /* result of serialize() */
380 };
381
392 UnicodeSet(const uint16_t buffer[], int32_t bufferLen,
394#endif /* U_HIDE_INTERNAL_API */
395
404 UnicodeSet(const UnicodeString& pattern,
406
407#ifndef U_HIDE_INTERNAL_API
420 UnicodeSet(const UnicodeString& pattern,
421 uint32_t options,
422 const SymbolTable* symbols,
424#endif /* U_HIDE_INTERNAL_API */
425
440 uint32_t options,
441 const SymbolTable* symbols,
443
449
454 virtual ~UnicodeSet();
455
462
474 virtual UBool operator==(const UnicodeSet& o) const;
475
481 UBool operator!=(const UnicodeSet& o) const;
482
492 virtual UnicodeFunctor* clone() const;
493
501 virtual int32_t hashCode(void) const;
502
511 inline static UnicodeSet *fromUSet(USet *uset);
512
521 inline static const UnicodeSet *fromUSet(const USet *uset);
522
530 inline USet *toUSet();
531
532
540 inline const USet * toUSet() const;
541
542
543 //----------------------------------------------------------------
544 // Freezable API
545 //----------------------------------------------------------------
546
555 inline UBool isFrozen() const;
556
571
581
582 //----------------------------------------------------------------
583 // Public API
584 //----------------------------------------------------------------
585
597
603 static UBool resemblesPattern(const UnicodeString& pattern,
604 int32_t pos);
605
620
621#ifndef U_HIDE_INTERNAL_API
639 uint32_t options,
640 const SymbolTable* symbols,
642#endif /* U_HIDE_INTERNAL_API */
643
676 ParsePosition& pos,
677 uint32_t options,
678 const SymbolTable* symbols,
680
696
720 int32_t value,
721 UErrorCode& ec);
722
753 const UnicodeString& value,
754 UErrorCode& ec);
755
764 virtual int32_t size(void) const;
765
772 virtual UBool isEmpty(void) const;
773
781 virtual UBool contains(UChar32 c) const;
782
791 virtual UBool contains(UChar32 start, UChar32 end) const;
792
800 UBool contains(const UnicodeString& s) const;
801
809 virtual UBool containsAll(const UnicodeSet& c) const;
810
819
829
838
847
856 inline UBool containsSome(UChar32 start, UChar32 end) const;
857
865 inline UBool containsSome(const UnicodeSet& s) const;
866
874 inline UBool containsSome(const UnicodeString& s) const;
875
894 int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
895
908 inline int32_t span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const;
909
927 int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const;
928
942 inline int32_t spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const;
943
962 int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
963
981 int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const;
982
987 virtual UMatchDegree matches(const Replaceable& text,
988 int32_t& offset,
989 int32_t limit,
991
992private:
1015 static int32_t matchRest(const Replaceable& text,
1016 int32_t start, int32_t limit,
1017 const UnicodeString& s);
1018
1028 int32_t findCodePoint(UChar32 c) const;
1029
1030public:
1031
1039 virtual void addMatchSetTo(UnicodeSet& toUnionTo) const;
1040
1049 int32_t indexOf(UChar32 c) const;
1050
1060 UChar32 charAt(int32_t index) const;
1061
1076 virtual UnicodeSet& add(UChar32 start, UChar32 end);
1077
1086
1099
1100 private:
1106 static int32_t getSingleCP(const UnicodeString& s);
1107
1108 void _add(const UnicodeString& s);
1109
1110 public:
1120
1130
1140
1150
1160
1161
1170
1184 virtual UnicodeSet& retain(UChar32 start, UChar32 end);
1185
1186
1193
1207 virtual UnicodeSet& remove(UChar32 start, UChar32 end);
1208
1217
1228
1236 virtual UnicodeSet& complement(void);
1237
1252 virtual UnicodeSet& complement(UChar32 start, UChar32 end);
1253
1262
1274
1287 virtual UnicodeSet& addAll(const UnicodeSet& c);
1288
1300 virtual UnicodeSet& retainAll(const UnicodeSet& c);
1301
1313 virtual UnicodeSet& removeAll(const UnicodeSet& c);
1314
1326
1333 virtual UnicodeSet& clear(void);
1334
1361
1369
1377 virtual int32_t getRangeCount(void) const;
1378
1386 virtual UChar32 getRangeStart(int32_t index) const;
1387
1395 virtual UChar32 getRangeEnd(int32_t index) const;
1396
1445 int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode& ec) const;
1446
1454
1467
1476 virtual UClassID getDynamicClassID(void) const;
1477
1478private:
1479
1480 // Private API for the USet API
1481
1482 friend class USetAccess;
1483
1484 int32_t getStringCount() const;
1485
1486 const UnicodeString* getString(int32_t index) const;
1487
1488 //----------------------------------------------------------------
1489 // RuleBasedTransliterator support
1490 //----------------------------------------------------------------
1491
1492private:
1493
1499 virtual UBool matchesIndexValue(uint8_t v) const;
1500
1501private:
1502 friend class RBBIRuleScanner;
1503
1504 //----------------------------------------------------------------
1505 // Implementation: Clone as thawed (see ICU4J Freezable)
1506 //----------------------------------------------------------------
1507
1508 UnicodeSet(const UnicodeSet& o, UBool /* asThawed */);
1509
1510 //----------------------------------------------------------------
1511 // Implementation: Pattern parsing
1512 //----------------------------------------------------------------
1513
1514 void applyPatternIgnoreSpace(const UnicodeString& pattern,
1515 ParsePosition& pos,
1516 const SymbolTable* symbols,
1518
1519 void applyPattern(RuleCharacterIterator& chars,
1520 const SymbolTable* symbols,
1522 uint32_t options,
1524 int32_t depth,
1525 UErrorCode& ec);
1526
1527 //----------------------------------------------------------------
1528 // Implementation: Utility methods
1529 //----------------------------------------------------------------
1530
1531 void ensureCapacity(int32_t newLen, UErrorCode& ec);
1532
1533 void ensureBufferCapacity(int32_t newLen, UErrorCode& ec);
1534
1535 void swapBuffers(void);
1536
1537 UBool allocateStrings(UErrorCode &status);
1538
1539 UnicodeString& _toPattern(UnicodeString& result,
1540 UBool escapeUnprintable) const;
1541
1542 UnicodeString& _generatePattern(UnicodeString& result,
1543 UBool escapeUnprintable) const;
1544
1545 static void _appendToPat(UnicodeString& buf, const UnicodeString& s, UBool escapeUnprintable);
1546
1547 static void _appendToPat(UnicodeString& buf, UChar32 c, UBool escapeUnprintable);
1548
1549 //----------------------------------------------------------------
1550 // Implementation: Fundamental operators
1551 //----------------------------------------------------------------
1552
1553 void exclusiveOr(const UChar32* other, int32_t otherLen, int8_t polarity);
1554
1555 void add(const UChar32* other, int32_t otherLen, int8_t polarity);
1556
1557 void retain(const UChar32* other, int32_t otherLen, int8_t polarity);
1558
1564 static UBool resemblesPropertyPattern(const UnicodeString& pattern,
1565 int32_t pos);
1566
1567 static UBool resemblesPropertyPattern(RuleCharacterIterator& chars,
1568 int32_t iterOpts);
1569
1609 UnicodeSet& applyPropertyPattern(const UnicodeString& pattern,
1611 UErrorCode &ec);
1612
1613 void applyPropertyPattern(RuleCharacterIterator& chars,
1615 UErrorCode& ec);
1616
1618 static const UnicodeSet* getInclusions(int32_t src, UErrorCode &status);
1619
1624 typedef UBool (*Filter)(UChar32 codePoint, void* context);
1625
1635 void applyFilter(Filter filter,
1636 void* context,
1637 int32_t src,
1639
1643 void setPattern(const UnicodeString& newPat);
1647 void releasePattern();
1648
1649 friend class UnicodeSetIterator;
1650};
1651
1652
1653
// Inequality comparison: returns the logical negation of operator==(o)
// for the same argument, so the two operators can never disagree.
1654inline UBool UnicodeSet::operator!=(const UnicodeSet& o) const {
1655 return !operator==(o); // operator== is declared virtual in the class
1656}
1657
// Reports whether this set is frozen.
// A set counts as frozen exactly when at least one of the members
// bmpSet or stringSpan is non-NULL; no other state is consulted.
1658inline UBool UnicodeSet::isFrozen() const {
1659 return (UBool)(bmpSet!=NULL || stringSpan!=NULL);
1660}
1661
// Range overload of containsSome.
// Returns the negation of containsNone(start, end), i.e. true when
// containsNone reports false for the range start..end.
1662inline UBool UnicodeSet::containsSome(UChar32 start, UChar32 end) const {
1663 return !containsNone(start, end);
1664}
1665
// Set overload of containsSome.
// Returns the negation of containsNone(s), i.e. true when containsNone
// reports false for the given set.
1666inline UBool UnicodeSet::containsSome(const UnicodeSet& s) const {
1667 return !containsNone(s);
1668}
1669
// String overload of containsSome.
// Returns the negation of containsNone(s), i.e. true when containsNone
// reports false for the given string.
1670inline UBool UnicodeSet::containsSome(const UnicodeString& s) const {
1671 return !containsNone(s);
1672}
1673
// Reports whether this set is marked bogus (i.e. not valid).
// Tests the kIsBogus bit of fFlags; because kIsBogus is 1, the result
// is 1 when the bit is set and 0 otherwise.
1674inline UBool UnicodeSet::isBogus() const {
1675 return (UBool)(fFlags & kIsBogus);
1676}
1677
// Converts a USet pointer to a UnicodeSet pointer.
// This is a pure pointer reinterpretation: nothing is allocated or
// copied, and the returned pointer refers to the same object as uset.
1678inline UnicodeSet *UnicodeSet::fromUSet(USet *uset) {
1679 return reinterpret_cast<UnicodeSet *>(uset);
1680}
1681
// Const overload: converts a const USet pointer to a const UnicodeSet
// pointer by reinterpretation only; constness is preserved and no
// object is created.
1682inline const UnicodeSet *UnicodeSet::fromUSet(const USet *uset) {
1683 return reinterpret_cast<const UnicodeSet *>(uset);
1684}
1685
// Returns this object viewed as a USet pointer.
// The inverse of fromUSet(USet*): the this pointer is reinterpreted,
// not copied, so the result refers to this same object.
1686inline USet *UnicodeSet::toUSet() {
1687 return reinterpret_cast<USet *>(this);
1688}
1689
// Const overload: returns this object viewed as a const USet pointer.
// The this pointer is reinterpreted, not copied.
1690inline const USet *UnicodeSet::toUSet() const {
1691 return reinterpret_cast<const USet *>(this);
1692}
1693
// UnicodeString convenience overload of span().
//   s             - text to examine
//   start         - index in s at which to begin; out-of-range values are
//                   clamped into [0, s.length()] rather than rejected
//   spanCondition - forwarded unchanged to the char16_t* overload
// Returns the clamped start plus whatever the char16_t* overload returns
// for the text from start to the end of s, so the result is an index
// relative to the beginning of s, not relative to start.
1694inline int32_t UnicodeSet::span(const UnicodeString &s, int32_t start, USetSpanCondition spanCondition) const {
1695 int32_t sLength=s.length();
1696 if(start<0) { // negative start: begin at the front of s
1697 start=0;
1698 } else if(start>sLength) { // start past the end: examine an empty tail
1699 start=sLength;
1700 }
1701 return start+span(s.getBuffer()+start, sLength-start, spanCondition); // add start back to rebase the result onto s
1702}
1703
// UnicodeString convenience overload of spanBack().
//   s             - text to examine
//   limit         - number of leading elements of s to consider; values
//                   outside [0, s.length()] are clamped into that range
//   spanCondition - forwarded unchanged to the char16_t* overload
// Returns the result of the char16_t* overload unchanged. No rebasing is
// needed because the examined text always begins at the start of s's
// buffer.
1704inline int32_t UnicodeSet::spanBack(const UnicodeString &s, int32_t limit, USetSpanCondition spanCondition) const {
1705 int32_t sLength=s.length();
1706 if(limit<0) { // negative limit: examine an empty prefix
1707 limit=0;
1708 } else if(limit>sLength) { // limit past the end: examine all of s
1709 limit=sLength;
1710 }
1711 return spanBack(s.getBuffer(), limit, spanCondition);
1712}
1713
1715
1716#endif
"Smart pointer" base class; do not use directly: use LocalPointer etc.
ParsePosition is a simple class used by Format and its subclasses to keep track of the current positi...
Definition parsepos.h:49
Replaceable is an abstract base class representing a string of characters that supports the replaceme...
Definition rep.h:73
An interface that defines both lookup protocol and parsing of symbolic names.
Definition symtable.h:56
UnicodeFilter defines a protocol for selecting a subset of the full range (U+0000 to U+10FFFF) of Uni...
Definition unifilt.h:61
UnicodeFunctor is an abstract base class for objects that perform match and/or replace operations on ...
Definition unifunct.h:35
virtual UBool matchesIndexValue(uint8_t v) const =0
Returns TRUE if this matcher will match a character c, where c & 0xFF == v, at offset,...
UnicodeSetIterator iterates over the contents of a UnicodeSet.
Definition usetiter.h:63
A mutable set of Unicode characters and multicharacter strings.
Definition uniset.h:278
UnicodeFunctor * freeze()
Freeze the set (make it immutable).
UnicodeSet & operator=(const UnicodeSet &o)
Assigns this object to be a copy of another.
UnicodeSet & addAll(const UnicodeString &s)
Adds each of the characters in this string to the set.
virtual UChar32 getRangeEnd(int32_t index) const
Iteration method that returns the last character in the specified range of this set.
UnicodeSet()
Constructs an empty set.
int32_t spanBackUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UnicodeSet(UChar32 start, UChar32 end)
Constructs a set containing the given range.
static UnicodeSet * createFromAll(const UnicodeString &s)
Makes a set from each of the characters in the string.
int32_t spanUTF8(const char *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
UnicodeSet & complementAll(const UnicodeString &s)
Complement EACH of the characters in this string.
void setToBogus()
Make this UnicodeSet object invalid.
UnicodeSet & applyPropertyAlias(const UnicodeString &prop, const UnicodeString &value, UErrorCode &ec)
Modifies this set to contain those code points which have the given value for the given property.
virtual UnicodeSet & removeAll(const UnicodeSet &c)
Removes from this set all of its elements that are contained in the specified set.
virtual UnicodeSet & remove(UChar32 start, UChar32 end)
Removes the specified range from this set if it is present.
UnicodeSet & applyPattern(const UnicodeString &pattern, UErrorCode &status)
Modifies this set to represent the set specified by the given pattern, ignoring Unicode Pattern_White...
virtual UnicodeSet & complement(void)
Inverts this set.
virtual UChar32 getRangeStart(int32_t index) const
Iteration method that returns the first character in the specified range of this set.
UnicodeSet & add(const UnicodeString &s)
Adds the specified multicharacter string to this set if it is not already present.
UnicodeSet & applyPattern(const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Modifies this set to represent the set specified by the given pattern, optionally ignoring Unicode Pa...
virtual UnicodeSet & clear(void)
Removes all of the elements from this set.
int32_t indexOf(UChar32 c) const
Returns the index of the given character within this set, where the set is ordered by ascending code ...
UnicodeSet & closeOver(int32_t attribute)
Close this set over the given attribute.
UnicodeSet(const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Constructs a set from the given pattern.
virtual int32_t hashCode(void) const
Returns the hash code value for this set.
virtual UnicodeSet & complementAll(const UnicodeSet &c)
Complements in this set all elements contained in the specified set.
virtual UMatchDegree matches(const Replaceable &text, int32_t &offset, int32_t limit, UBool incremental)
Implement UnicodeMatcher::matches()
UnicodeSet & complement(const UnicodeString &s)
Complement the specified string in this set.
virtual UBool containsAll(const UnicodeSet &c) const
Returns true if this set contains all the characters and strings of the given set.
UBool containsNone(const UnicodeString &s) const
Returns true if this set contains none of the characters of the given string.
UnicodeSet(const UnicodeSet &o)
Constructs a set that is identical to the given UnicodeSet.
static UBool resemblesPattern(const UnicodeString &pattern, int32_t pos)
Return true if the given position, in the given pattern, appears to be the start of a UnicodeSet patt...
virtual UnicodeFunctor * clone() const
Returns a copy of this object.
virtual ~UnicodeSet()
Destructs the set.
UnicodeSet & retain(UChar32 c)
Retain the specified character from this set if it is present.
virtual UnicodeSet & compact()
Reallocate this object's internal structures to take up the least possible space, without changing thi...
int32_t span(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the length of the initial substring of the input string which consists only of characters and...
UnicodeSet & retainAll(const UnicodeString &s)
Retains EACH of the characters in this string.
virtual UnicodeSet & removeAllStrings()
Remove all strings from this set.
UChar32 charAt(int32_t index) const
Returns the character at the given index within this set, where the set is ordered by ascending code ...
virtual void addMatchSetTo(UnicodeSet &toUnionTo) const
Implementation of UnicodeMatcher API.
virtual UClassID getDynamicClassID(void) const
Implement UnicodeFunctor API.
virtual int32_t getRangeCount(void) const
Iteration method that returns the number of ranges contained in this set.
UBool containsNone(UChar32 start, UChar32 end) const
Returns true if this set contains none of the characters of the given range.
virtual UBool contains(UChar32 start, UChar32 end) const
Returns true if this set contains every character of the given range.
UBool containsAll(const UnicodeString &s) const
Returns true if this set contains all the characters of the given string.
UnicodeSet & removeAll(const UnicodeString &s)
Remove EACH of the characters in this string.
virtual int32_t size(void) const
Returns the number of elements in this set (its cardinality).
int32_t serialize(uint16_t *dest, int32_t destCapacity, UErrorCode &ec) const
Serializes this set into an array of 16-bit integers.
UnicodeSet & set(UChar32 start, UChar32 end)
Make this object represent the range start - end.
UBool contains(const UnicodeString &s) const
Returns true if this set contains the given multicharacter string.
virtual UBool contains(UChar32 c) const
Returns true if this set contains the given character.
virtual UnicodeSet & complement(UChar32 start, UChar32 end)
Complements the specified range in this set.
static UnicodeSet * createFrom(const UnicodeString &s)
Makes a set from a multicharacter string.
static UClassID getStaticClassID(void)
Return the class ID for this class.
virtual UBool isEmpty(void) const
Returns true if this set contains no elements.
UnicodeSet(const UnicodeString &pattern, UErrorCode &status)
Constructs a set from the given pattern.
UnicodeSet & remove(UChar32 c)
Removes the specified character from this set if it is present.
UnicodeSet & add(UChar32 c)
Adds the specified character to this set if it is not already present.
UnicodeSet & applyIntPropertyValue(UProperty prop, int32_t value, UErrorCode &ec)
Modifies this set to contain those code points which have the given value for the given binary or enu...
int32_t spanBack(const char16_t *s, int32_t length, USetSpanCondition spanCondition) const
Returns the start of the trailing substring of the input string which consists only of characters and...
UnicodeSet & remove(const UnicodeString &s)
Removes the specified string from this set if it is present.
virtual UnicodeSet & add(UChar32 start, UChar32 end)
Adds the specified range to this set if it is not already present.
virtual UBool operator==(const UnicodeSet &o) const
Compares the specified object with this set for equality.
virtual UnicodeString & toPattern(UnicodeString &result, UBool escapeUnprintable=FALSE) const
Returns a string representation of this set.
UnicodeSet & applyPattern(const UnicodeString &pattern, ParsePosition &pos, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Parses the given pattern, starting at the given position.
UnicodeFunctor * cloneAsThawed() const
Clone the set and make the clone mutable.
UnicodeSet(const uint16_t buffer[], int32_t bufferLen, ESerialization serialization, UErrorCode &status)
Constructs a set from the output of serialize().
virtual UnicodeSet & retain(UChar32 start, UChar32 end)
Retain only the elements in this set that are contained in the specified range.
friend void UnicodeSet_initInclusion(int32_t src, UErrorCode &status)
virtual UnicodeSet & addAll(const UnicodeSet &c)
Adds all of the elements in the specified set to this set if they're not already present.
UnicodeSet(const UnicodeString &pattern, uint32_t options, const SymbolTable *symbols, UErrorCode &status)
Constructs a set from the given pattern.
virtual UnicodeSet & retainAll(const UnicodeSet &c)
Retains only the elements in this set that are contained in the specified set.
UnicodeSet & complement(UChar32 c)
Complements the specified character in this set.
UBool containsNone(const UnicodeSet &c) const
Returns true if this set contains none of the characters and strings of the given set.
UnicodeString is a string class that stores Unicode characters directly and provides similar function...
Definition unistr.h:287
int32_t length(void) const
Return the length of the UnicodeString object.
Definition unistr.h:3906
char16_t * getBuffer(int32_t minCapacity)
Get a read/write pointer to the internal buffer.
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UMatchDegree
Constants returned by UnicodeMatcher::matches() indicating the degree of match.
Definition unimatch.h:32
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
void UnicodeSet_initInclusion(int32_t src, UErrorCode &status)
#define U_CALLCONV
Similar to U_CDECL_BEGIN/U_CDECL_END, this qualifier is necessary in callback function typedefs to ma...
Definition platform.h:835
UProperty
Selection constants for Unicode properties.
Definition uchar.h:165
struct USet USet
Definition ucnv.h:69
int32_t UChar32
Define UChar32 as a type for single Unicode code points.
Definition umachine.h:400
int8_t UBool
The ICU boolean type.
Definition umachine.h:236
#define FALSE
The FALSE value of a UBool.
Definition umachine.h:244
C++ API: Unicode Filter.
C++ API: Unicode String.
void * UClassID
UClassID is used to identify classes without using the compiler's RTTI.
Definition uobject.h:93
C API: Unicode Set.
USetSpanCondition
Argument values for whether span() and similar functions continue while the current character is cont...
Definition uset.h:152
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition utypes.h:188
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers,...
Definition utypes.h:396
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition utypes.h:359
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition uversion.h:138
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition uversion.h:137