// Phrase extractor class, exported through OPENCC_EXPORT.
// NOTE(review): only part of the class body is present in this excerpt (the
// embedded line numbers skip and the closing brace is not visible) - consult
// the full header before editing.
29class OPENCC_EXPORT PhraseExtract {
// Class-local shorthand for the length type defined by UTF8StringSlice.
31 typedef UTF8StringSlice::LengthType LengthType;
// Virtual destructor; declared here, defined outside this excerpt.
37 virtual ~PhraseExtract();
// Entry point that runs the extraction steps for `text`.
// The statements visible here call CalculateSuffixEntropy(),
// CalculatePrefixEntropy() and ExtractWordCandidates(), in that order.
// NOTE(review): the embedded numbering jumps (39 -> 43 -> 46 -> 48), so other
// statements and the closing brace appear to be missing from this excerpt -
// confirm the full sequence against the complete file.
39 void Extract(
const std::string& text) {
43 CalculateSuffixEntropy();
46 CalculatePrefixEntropy();
48 ExtractWordCandidates();
// Sets the text to analyse from a std::string: utf8FullText becomes a
// UTF8StringSlice built from fullText.c_str().
// NOTE(review): if UTF8StringSlice only references the buffer rather than
// copying it, `fullText` must outlive every later use of utf8FullText -
// confirm against the UTF8StringSlice definition.
53 void SetFullText(
const std::string& fullText) {
54 utf8FullText = UTF8StringSlice(fullText.c_str());
// Sets the text to analyse from a C string: utf8FullText becomes a
// UTF8StringSlice built from the `fullText` pointer.
// NOTE(review): presumably the slice does not own the characters, so the
// pointed-to buffer must stay valid while it is in use - confirm.
57 void SetFullText(
const char* fullText) {
58 utf8FullText = UTF8StringSlice(fullText);
// Sets the text to analyse from an existing slice by assigning it to
// utf8FullText.
61 void SetFullText(
const UTF8StringSlice& fullText) { utf8FullText = fullText; }
// Stores `_wordMinLength` in the wordMinLength member.
// NOTE(review): presumably the minimum length of a word candidate; the unit
// (characters vs. bytes) is not visible here - confirm.
63 void SetWordMinLength(
const LengthType _wordMinLength) {
64 wordMinLength = _wordMinLength;
// Stores `_wordMaxLength` in the wordMaxLength member.
// NOTE(review): presumably the maximum length of a word candidate; the unit
// (characters vs. bytes) is not visible here - confirm.
67 void SetWordMaxLength(
const LengthType _wordMaxLength) {
68 wordMaxLength = _wordMaxLength;
// Stores `_prefixSetLength` in the prefixSetLength member.
// NOTE(review): how this length is used during prefix handling is not visible
// in this excerpt - confirm before relying on a particular meaning.
71 void SetPrefixSetLength(
const LengthType _prefixSetLength) {
72 prefixSetLength = _prefixSetLength;
// Stores `_suffixSetLength` in the suffixSetLength member.
// NOTE(review): how this length is used during suffix handling is not visible
// in this excerpt - confirm before relying on a particular meaning.
75 void SetSuffixSetLength(
const LengthType _suffixSetLength) {
76 suffixSetLength = _suffixSetLength;
// Installs `filter` as preCalculationFilter (copied into the member).
// The callable receives the extractor and a UTF8StringSlice8Bit and returns
// bool.
// NOTE(review): whether `true` means "keep" or "discard" is decided by the
// call site, which is not in this excerpt - confirm.
80 void SetPreCalculationFilter(
81 const std::function<
bool(
const PhraseExtract&,
82 const UTF8StringSlice8Bit&)>& filter) {
83 preCalculationFilter = filter;
// Installs `filter` as postCalculationFilter (copied into the member).
// The callable receives the extractor and a UTF8StringSlice8Bit and returns
// bool.
// NOTE(review): whether `true` means "keep" or "discard" is decided by the
// call site, which is not in this excerpt - confirm.
86 void SetPostCalculationFilter(
87 const std::function<
bool(
const PhraseExtract&,
88 const UTF8StringSlice8Bit&)>& filter) {
89 postCalculationFilter = filter;
// Empties `suffixes` by swapping it with a temporary empty vector; the old
// contents are destroyed together with that temporary, so the storage is
// actually given back rather than just cleared.
92 void ReleaseSuffixes() { std::vector<UTF8StringSlice8Bit>().swap(suffixes); }
// Empties `prefixes` by swapping it with a temporary empty vector; the old
// contents are destroyed together with that temporary, so the storage is
// actually given back rather than just cleared.
94 void ReleasePrefixes() { std::vector<UTF8StringSlice8Bit>().swap(prefixes); }
// Read-only access to the `words` vector. The returned reference aliases the
// member, so it reflects later modifications made by this object.
96 const std::vector<UTF8StringSlice8Bit>& Words()
const {
return words; }
// Read-only access to the `wordCandidates` vector. The returned reference
// aliases the member, so it reflects later modifications made by this object.
98 const std::vector<UTF8StringSlice8Bit>& WordCandidates()
const {
99 return wordCandidates;
// Entropy values stored as doubles.
// NOTE(review): these look like fields of the per-candidate Signals record
// returned by Signal(); the enclosing declaration is missing from this
// excerpt - confirm.
105 double suffixEntropy;
106 double prefixEntropy;
// Returns a const reference to the Signals record for `wordCandidate`.
// Declaration only; the definition (and its behaviour for an unknown
// candidate) is not in this excerpt.
109 const Signals& Signal(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns a double for `wordCandidate`; by its name, the candidate's cohesion
// score. Declaration only - the definition is not in this excerpt.
111 double Cohesion(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns a double for `wordCandidate`; by its name, an entropy score.
// NOTE(review): how it relates to SuffixEntropy()/PrefixEntropy() is not
// visible here - confirm in the definition.
113 double Entropy(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns a double for `wordCandidate`; by its name, the suffix-side entropy.
// Declaration only - the definition is not in this excerpt.
115 double SuffixEntropy(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns a double for `wordCandidate`; by its name, the prefix-side entropy.
// Declaration only - the definition is not in this excerpt.
117 double PrefixEntropy(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Returns a size_t for `word`; by its name, an occurrence count.
// Declaration only - the definition is not in this excerpt.
119 size_t Frequency(
const UTF8StringSlice8Bit& word)
const;
// Returns a double for `word`; by its name, a probability.
// NOTE(review): presumably derived from Frequency() and totalOccurrence -
// confirm in the definition.
121 double Probability(
const UTF8StringSlice8Bit& word)
const;
// Returns a double for `word`; by its name, the logarithm of its probability.
// NOTE(review): the log base and the use of logTotalOccurrence are not
// visible here - confirm in the definition.
123 double LogProbability(
const UTF8StringSlice8Bit& word)
const;
// Individual processing steps, declared here and defined elsewhere.
// Each takes no arguments and returns nothing, so they operate on the
// object's own state.
// NOTE(review): the class has one "...Extracted"/"...Calculated" flag per
// step, which suggests ordering dependencies between them - confirm the
// required call order in the definitions.
127 void ExtractSuffixes();
129 void ExtractPrefixes();
131 void ExtractWordCandidates();
133 void CalculateFrequency();
135 void CalculateCohesions();
137 void CalculateSuffixEntropy();
139 void CalculatePrefixEntropy();
// Default pre-calculation filter. Its parameter list matches the callable
// type held by preCalculationFilter (extractor, candidate slice).
// NOTE(review): the line carrying the return type and any specifiers is
// missing from this excerpt - confirm against the full file.
144 DefaultPreCalculationFilter(
const PhraseExtract&,
145 const PhraseExtract::UTF8StringSlice8Bit&);
// Default post-calculation filter. Its parameter list matches the callable
// type held by postCalculationFilter (extractor, candidate slice).
// NOTE(review): the line carrying the return type and any specifiers is
// missing from this excerpt - confirm against the full file.
148 DefaultPostCalculationFilter(
const PhraseExtract&,
149 const PhraseExtract::UTF8StringSlice8Bit&);
// Returns a double computed from a candidate and two slices, `part1` and
// `part2`.
// NOTE(review): presumably the pointwise mutual information of splitting
// `wordCandidate` into `part1` + `part2` - confirm in the definition.
155 double PMI(
const UTF8StringSlice8Bit& wordCandidate,
156 const UTF8StringSlice8Bit& part1,
157 const UTF8StringSlice8Bit& part2)
const;
// Computes a double for a single `wordCandidate` without modifying the object
// (const member). Declaration only - the definition is not in this excerpt.
// NOTE(review): presumably the helper behind CalculateCohesions() - confirm.
159 double CalculateCohesion(
const UTF8StringSlice8Bit& wordCandidate)
const;
// Computes a double from `choices`, a hash map from UTF8StringSlice8Bit to a
// size_t value (hashed with UTF8StringSlice8Bit::Hasher). Const member.
// NOTE(review): presumably the size_t values are occurrence counts and the
// result is the entropy of that distribution - confirm in the definition.
161 double CalculateEntropy(
162 const std::unordered_map<UTF8StringSlice8Bit,
size_t,
163 UTF8StringSlice8Bit::Hasher>& choices)
const;
// Length settings, written by SetWordMinLength / SetWordMaxLength /
// SetPrefixSetLength / SetSuffixSetLength.
// NOTE(review): initial values are assigned outside this excerpt
// (presumably in the constructor) - confirm.
165 LengthType wordMinLength;
166 LengthType wordMaxLength;
167 LengthType prefixSetLength;
168 LengthType suffixSetLength;
// Filter callables installed via SetPreCalculationFilter /
// SetPostCalculationFilter. Each takes the extractor and a candidate slice
// and returns bool.
// NOTE(review): where they are invoked and what `true` means is not visible
// in this excerpt - confirm.
169 std::function<bool(
const PhraseExtract&,
const UTF8StringSlice8Bit&)>
170 preCalculationFilter;
171 std::function<bool(
const PhraseExtract&,
const UTF8StringSlice8Bit&)>
172 postCalculationFilter;
// One boolean per processing step.
// NOTE(review): presumably each is set once the matching Extract*/Calculate*
// step has run, so later steps can check their prerequisites - the code that
// reads and writes them is not in this excerpt; confirm.
174 bool prefixesExtracted;
175 bool suffixesExtracted;
176 bool frequenciesCalculated;
177 bool wordCandidatesExtracted;
178 bool cohesionsCalculated;
179 bool prefixEntropiesCalculated;
180 bool suffixEntropiesCalculated;
// Working data.
// utf8FullText is assigned by the SetFullText overloads.
// prefixes / suffixes are emptied by ReleasePrefixes / ReleaseSuffixes.
// wordCandidates / words are exposed read-only via WordCandidates() / Words().
// NOTE(review): totalOccurrence and logTotalOccurrence are presumably a total
// count and its logarithm, filled in by CalculateFrequency() - confirm.
183 UTF8StringSlice utf8FullText;
184 size_t totalOccurrence;
185 double logTotalOccurrence;
186 std::vector<UTF8StringSlice8Bit> prefixes;
187 std::vector<UTF8StringSlice8Bit> suffixes;
188 std::vector<UTF8StringSlice8Bit> wordCandidates;
189 std::vector<UTF8StringSlice8Bit> words;
// Gives the PhraseExtractTest class access to this class's non-public
// members.
192 friend class PhraseExtractTest;