ICU 62.1 62.1
stringtriebuilder.h
Go to the documentation of this file.
1// © 2016 and later: Unicode, Inc. and others.
2// License & terms of use: http://www.unicode.org/copyright.html
3/*
4*******************************************************************************
5* Copyright (C) 2010-2012,2014, International Business Machines
6* Corporation and others. All Rights Reserved.
7*******************************************************************************
8* file name: stringtriebuilder.h
9* encoding: UTF-8
10* tab size: 8 (not used)
11* indentation:4
12*
13* created on: 2010dec24
14* created by: Markus W. Scherer
15*/
16
17#ifndef __STRINGTRIEBUILDER_H__
18#define __STRINGTRIEBUILDER_H__
19
20#include "unicode/utypes.h"
21#include "unicode/uobject.h"
22
28// Forward declaration.
29struct UHashtable;
30typedef struct UHashtable UHashtable;
31
54
56
64public:
65#ifndef U_HIDE_INTERNAL_API
67 static UBool hashNode(const void *node);
69 static UBool equalNodes(const void *left, const void *right);
70#endif /* U_HIDE_INTERNAL_API */
71
72protected:
73 // Do not enclose the protected default constructor with #ifndef U_HIDE_INTERNAL_API
74 // or else the compiler will create a public default constructor.
79
80#ifndef U_HIDE_INTERNAL_API
82 void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode);
85
87 void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode);
88
90 int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex);
92 int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length);
93#endif /* U_HIDE_INTERNAL_API */
94
95 class Node;
96
97#ifndef U_HIDE_INTERNAL_API
99 Node *makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode);
101 Node *makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex,
102 int32_t length, UErrorCode &errorCode);
103#endif /* U_HIDE_INTERNAL_API */
104
106 virtual int32_t getElementStringLength(int32_t i) const = 0;
108 virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const = 0;
110 virtual int32_t getElementValue(int32_t i) const = 0;
111
112 // Finds the first unit index after this one where
113 // the first and last element have different units again.
115 virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const = 0;
116
117 // Number of different units at unitIndex.
119 virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const = 0;
121 virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const = 0;
123 virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const = 0;
124
126 virtual UBool matchNodesCanHaveValues() const = 0;
127
129 virtual int32_t getMaxBranchLinearSubNodeLength() const = 0;
131 virtual int32_t getMinLinearMatch() const = 0;
133 virtual int32_t getMaxLinearMatchLength() const = 0;
134
135#ifndef U_HIDE_INTERNAL_API
136 // max(BytesTrie::kMaxBranchLinearSubNodeLength, UCharsTrie::kMaxBranchLinearSubNodeLength).
138 static const int32_t kMaxBranchLinearSubNodeLength=5;
139
140 // Maximum number of nested split-branch levels for a branch on all 2^16 possible char16_t units.
141 // log2(2^16/kMaxBranchLinearSubNodeLength) rounded up.
143 static const int32_t kMaxSplitBranchLevels=14;
144
166 Node *registerFinalValue(int32_t value, UErrorCode &errorCode);
167#endif /* U_HIDE_INTERNAL_API */
168
169 /*
170 * C++ note:
171 * registerNode() and registerFinalValue() take ownership of their input nodes,
172 * and only return owned nodes.
173 * If they see a failure UErrorCode, they will delete the input node.
174 * If they get a NULL pointer, they will record a U_MEMORY_ALLOCATION_ERROR.
175 * If there is a failure, they return NULL.
176 *
177 * NULL Node pointers can be safely passed into other Nodes because
178 * they call the static Node::hashCode() which checks for a NULL pointer first.
179 *
180 * Therefore, as long as builder functions register a new node,
181 * they need to check for failures only before explicitly dereferencing
182 * a Node pointer, or before setting a new UErrorCode.
183 */
184
185 // Hash set of nodes, maps from nodes to integer 1.
187 UHashtable *nodes;
188
189 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
190 // it is needed for layout of other objects.
192 class Node : public UObject {
193 public:
194 Node(int32_t initialHash) : hash(initialHash), offset(0) {}
195 inline int32_t hashCode() const { return hash; }
196 // Handles node==NULL.
197 static inline int32_t hashCode(const Node *node) { return node==NULL ? 0 : node->hashCode(); }
198 // Base class operator==() compares the actual class types.
199 virtual UBool operator==(const Node &other) const;
200 inline UBool operator!=(const Node &other) const { return !operator==(other); }
228 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
229 // write() must set the offset to a positive value.
230 virtual void write(StringTrieBuilder &builder) = 0;
231 // See markRightEdgesFirst.
232 inline void writeUnlessInsideRightEdge(int32_t firstRight, int32_t lastRight,
234 // Note: Edge numbers are negative, lastRight<=firstRight.
235 // If offset>0 then this node and its sub-nodes have been written already
236 // and we need not write them again.
237 // If this node is part of the unwritten right branch edge,
238 // then we wait until that is written.
239 if(offset<0 && (offset<lastRight || firstRight<offset)) {
240 write(builder);
241 }
242 }
243 inline int32_t getOffset() const { return offset; }
244 protected:
245 int32_t hash;
246 int32_t offset;
247 };
248
249#ifndef U_HIDE_INTERNAL_API
250 // This class should not be overridden because
251 // registerFinalValue() compares a stack-allocated FinalValueNode
252 // (stack-allocated so that we don't unnecessarily create lots of duplicate nodes)
253 // with the input node, and the
254 // !Node::operator==(other) used inside FinalValueNode::operator==(other)
255 // will be false if the typeid's are different.
257 class FinalValueNode : public Node {
258 public:
259 FinalValueNode(int32_t v) : Node(0x111111u*37u+v), value(v) {}
260 virtual UBool operator==(const Node &other) const;
261 virtual void write(StringTrieBuilder &builder);
262 protected:
263 int32_t value;
264 };
265#endif /* U_HIDE_INTERNAL_API */
266
267 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
268 // it is needed for layout of other objects.
272 class ValueNode : public Node {
273 public:
274 ValueNode(int32_t initialHash) : Node(initialHash), hasValue(FALSE), value(0) {}
275 virtual UBool operator==(const Node &other) const;
276 void setValue(int32_t v) {
277 hasValue=TRUE;
278 value=v;
279 hash=hash*37u+v;
280 }
281 protected:
282 UBool hasValue;
283 int32_t value;
284 };
285
286#ifndef U_HIDE_INTERNAL_API
291 public:
293 : ValueNode(0x222222u*37u+hashCode(nextNode)), next(nextNode) { setValue(v); }
294 virtual UBool operator==(const Node &other) const;
295 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
296 virtual void write(StringTrieBuilder &builder);
297 protected:
298 Node *next;
299 };
300#endif /* U_HIDE_INTERNAL_API */
301
302 // Do not conditionalize the following with #ifndef U_HIDE_INTERNAL_API,
303 // it is needed for layout of other objects.
307 class LinearMatchNode : public ValueNode {
308 public:
309 LinearMatchNode(int32_t len, Node *nextNode)
310 : ValueNode((0x333333u*37u+len)*37u+hashCode(nextNode)),
311 length(len), next(nextNode) {}
312 virtual UBool operator==(const Node &other) const;
313 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
314 protected:
315 int32_t length;
316 Node *next;
317 };
318
319#ifndef U_HIDE_INTERNAL_API
323 class BranchNode : public Node {
324 public:
326 protected:
327 int32_t firstEdgeNumber;
328 };
329
333 class ListBranchNode : public BranchNode {
334 public:
335 ListBranchNode() : BranchNode(0x444444), length(0) {}
336 virtual UBool operator==(const Node &other) const;
337 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
338 virtual void write(StringTrieBuilder &builder);
339 // Adds a unit with a final value.
340 void add(int32_t c, int32_t value) {
341 units[length]=(char16_t)c;
342 equal[length]=NULL;
343 values[length]=value;
344 ++length;
345 hash=(hash*37u+c)*37u+value;
346 }
347 // Adds a unit which leads to another match node.
348 void add(int32_t c, Node *node) {
349 units[length]=(char16_t)c;
350 equal[length]=node;
351 values[length]=0;
352 ++length;
353 hash=(hash*37u+c)*37u+hashCode(node);
354 }
355 protected:
356 Node *equal[kMaxBranchLinearSubNodeLength]; // NULL means "has final value".
357 int32_t length;
358 int32_t values[kMaxBranchLinearSubNodeLength];
359 char16_t units[kMaxBranchLinearSubNodeLength];
360 };
361
366 public:
368 : BranchNode(((0x555555u*37u+middleUnit)*37u+
369 hashCode(lessThanNode))*37u+hashCode(greaterOrEqualNode)),
370 unit(middleUnit), lessThan(lessThanNode), greaterOrEqual(greaterOrEqualNode) {}
371 virtual UBool operator==(const Node &other) const;
372 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
373 virtual void write(StringTrieBuilder &builder);
374 protected:
375 char16_t unit;
376 Node *lessThan;
377 Node *greaterOrEqual;
378 };
379
380 // Branch head node, for writing the actual node lead unit.
382 class BranchHeadNode : public ValueNode {
383 public:
384 BranchHeadNode(int32_t len, Node *subNode)
385 : ValueNode((0x666666u*37u+len)*37u+hashCode(subNode)),
386 length(len), next(subNode) {}
387 virtual UBool operator==(const Node &other) const;
388 virtual int32_t markRightEdgesFirst(int32_t edgeNumber);
389 virtual void write(StringTrieBuilder &builder);
390 protected:
391 int32_t length;
392 Node *next; // A branch sub-node.
393 };
394#endif /* U_HIDE_INTERNAL_API */
395
397 virtual Node *createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length,
398 Node *nextNode) const = 0;
399
401 virtual int32_t write(int32_t unit) = 0;
403 virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length) = 0;
405 virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal) = 0;
407 virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node) = 0;
409 virtual int32_t writeDeltaTo(int32_t jumpTarget) = 0;
410};
411
413
414#endif // __STRINGTRIEBUILDER_H__
"Smart pointer" base class; do not use directly: use LocalPointer etc.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
virtual int32_t markRightEdgesFirst(int32_t edgeNumber)
Traverses the Node graph and numbers branch edges, with rightmost edges first.
Base class for string trie builder classes.
static UBool equalNodes(const void *left, const void *right)
virtual int32_t writeValueAndFinal(int32_t i, UBool isFinal)=0
virtual int32_t getMaxBranchLinearSubNodeLength() const =0
Node * registerFinalValue(int32_t value, UErrorCode &errorCode)
Makes sure that there is only one unique FinalValueNode registered with this value.
virtual int32_t countElementUnits(int32_t start, int32_t limit, int32_t unitIndex) const =0
virtual ~StringTrieBuilder()
virtual int32_t writeDeltaTo(int32_t jumpTarget)=0
Node * registerNode(Node *newNode, UErrorCode &errorCode)
Makes sure that there is only one unique node registered that is equivalent to newNode.
int32_t writeNode(int32_t start, int32_t limit, int32_t unitIndex)
Node * makeNode(int32_t start, int32_t limit, int32_t unitIndex, UErrorCode &errorCode)
virtual int32_t getLimitOfLinearMatch(int32_t first, int32_t last, int32_t unitIndex) const =0
Node * makeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length, UErrorCode &errorCode)
int32_t writeBranchSubNode(int32_t start, int32_t limit, int32_t unitIndex, int32_t length)
virtual int32_t getElementValue(int32_t i) const =0
virtual int32_t getMinLinearMatch() const =0
virtual int32_t skipElementsBySomeUnits(int32_t i, int32_t unitIndex, int32_t count) const =0
void build(UStringTrieBuildOption buildOption, int32_t elementsLength, UErrorCode &errorCode)
virtual int32_t writeValueAndType(UBool hasValue, int32_t value, int32_t node)=0
virtual int32_t indexOfElementWithNextUnit(int32_t i, int32_t unitIndex, char16_t unit) const =0
virtual UBool matchNodesCanHaveValues() const =0
virtual int32_t write(int32_t unit)=0
virtual char16_t getElementUnit(int32_t i, int32_t unitIndex) const =0
static UBool hashNode(const void *node)
virtual Node * createLinearMatchNode(int32_t i, int32_t unitIndex, int32_t length, Node *nextNode) const =0
virtual int32_t getMaxLinearMatchLength() const =0
virtual int32_t getElementStringLength(int32_t i) const =0
void createCompactBuilder(int32_t sizeGuess, UErrorCode &errorCode)
virtual int32_t writeElementUnits(int32_t i, int32_t unitIndex, int32_t length)=0
UObject is the common ICU "boilerplate" class.
Definition uobject.h:223
U_EXPORT UBool operator==(const StringPiece &x, const StringPiece &y)
Global operator == for StringPiece.
UBool operator!=(const StringPiece &x, const StringPiece &y)
Global operator != for StringPiece.
UStringTrieBuildOption
Build options for BytesTrieBuilder and CharsTrieBuilder.
@ USTRINGTRIE_BUILD_SMALL
Builds a trie more slowly, attempting to generate a shorter but equivalent serialization.
@ USTRINGTRIE_BUILD_FAST
Builds a trie quickly.
int8_t UBool
The ICU boolean type.
Definition umachine.h:236
#define TRUE
The TRUE value of a UBool.
Definition umachine.h:240
#define FALSE
The FALSE value of a UBool.
Definition umachine.h:244
C++ API: Common ICU base class UObject.
Basic definitions for ICU, for both C and C++ APIs.
#define NULL
Define NULL if necessary, to nullptr for C++ and to ((void *)0) for C.
Definition utypes.h:188
UErrorCode
Error code to replace exception handling, so that the code is compatible with all C++ compilers,...
Definition utypes.h:396
#define U_COMMON_API
Set to export library symbols from inside the common library, and to import them from outside.
Definition utypes.h:359
#define U_NAMESPACE_END
This is used to end a declaration of a public ICU C++ API.
Definition uversion.h:138
#define U_NAMESPACE_BEGIN
This is used to begin a declaration of a public ICU C++ API.
Definition uversion.h:137