liblcf
Loading...
Searching...
No Matches
reader_util.cpp
Go to the documentation of this file.
1/*
2 * This file is part of liblcf. Copyright (c) 2021 liblcf authors.
3 * https://github.com/EasyRPG/liblcf - https://easyrpg.org
4 *
5 * liblcf is Free/Libre Open Source Software, released under the MIT License.
6 * For the full copyright and license information, please view the COPYING
7 * file that was distributed with this source code.
8 */
9
10#include "lcf/config.h"
11#include "lcf/scope_guard.h"
12
13#if LCF_SUPPORT_ICU
14# include <unicode/ucsdet.h>
15# include <unicode/ucnv.h>
16# include <unicode/normalizer2.h>
17# include <unicode/unistr.h>
18#else
19# ifdef _MSC_VER
20# error MSVC builds require ICU
21# endif
22#endif
23
24#ifdef _WIN32
25# include <windows.h>
26#else
27# if !LCF_SUPPORT_ICU
28# include <iconv.h>
29# endif
30# include <locale>
31#endif
32
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
38
39#include "lcf/inireader.h"
40#include "lcf/ldb/reader.h"
41#include "lcf/reader_util.h"
42
43namespace lcf {
44
45namespace ReaderUtil {
46}
47
48std::string ReaderUtil::CodepageToEncoding(int codepage) {
49 if (codepage == 0)
50 return std::string();
51
52 if (codepage == 932) {
53#if LCF_SUPPORT_ICU
54 return "ibm-943_P15A-2003";
55#else
56 return "SHIFT_JIS";
57#endif
58 }
59 if (codepage == 949) {
60#if LCF_SUPPORT_ICU
61 return "windows-949-2000";
62#else
63 return "cp949";
64#endif
65 }
66 std::ostringstream out;
67#if LCF_SUPPORT_ICU
68 out << "windows-" << codepage;
69#else
70 out << "CP" << codepage;
71#endif
72
73 // Looks like a valid codepage
74 std::string outs = out.str();
75 return outs;
76}
77
78std::string ReaderUtil::DetectEncoding(lcf::rpg::Database& db) {
79 std::vector<std::string> encodings = DetectEncodings(db);
80
81 if (encodings.empty()) {
82 return "";
83 }
84
85 return encodings.front();
86}
87
88std::vector<std::string> ReaderUtil::DetectEncodings(lcf::rpg::Database& db) {
89#if LCF_SUPPORT_ICU
90 std::ostringstream text;
91
92 auto append = [](const auto& s) {
93 return ToString(s) + " ";
94 };
95
96 lcf::rpg::ForEachString(db.system, [&](const auto& val, const auto& ctx) {
97 text << append(val);
98 });
99
100 // Cannot use ForEachString here for Terms:
101 // Too much untranslated garbage data in there, even in default database
102 for (const auto& s: {
103 db.terms.menu_save,
104 db.terms.menu_quit,
105 db.terms.new_game,
106 db.terms.load_game,
107 db.terms.exit_game,
108 db.terms.status,
109 db.terms.row,
110 db.terms.order,
111 db.terms.wait_on,
112 db.terms.wait_off,
113 db.terms.level,
114 db.terms.health_points,
115 db.terms.spirit_points,
116 db.terms.normal_status,
117 db.terms.sp_cost,
118 db.terms.attack,
119 db.terms.defense,
120 db.terms.spirit,
121 db.terms.agility,
122 db.terms.weapon,
123 db.terms.shield,
124 db.terms.armor,
125 db.terms.helmet,
126 db.terms.accessory,
127 db.terms.save_game_message,
128 db.terms.load_game_message,
129 db.terms.exit_game_message,
130 db.terms.file,
131 db.terms.yes,
132 db.terms.no
133 }) {
134 text << append(s);
135 }
136
137 return ReaderUtil::DetectEncodings(text.str());
138#else
139 return std::vector<std::string>();
140#endif
141}
142
143std::string ReaderUtil::DetectEncoding(StringView string) {
144 std::vector<std::string> encodings = DetectEncodings(string);
145
146 if (encodings.empty()) {
147 return "";
148 }
149
150 return encodings.front();
151}
152
153std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
154std::vector<std::string> encodings;
155#if LCF_SUPPORT_ICU
156 if (!string.empty()) {
157 UErrorCode status = U_ZERO_ERROR;
158 UCharsetDetector* detector = ucsdet_open(&status);
159
160 auto s = std::string(string);
161 ucsdet_setText(detector, s.c_str(), s.length(), &status);
162
163 int32_t matches_count;
164 const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);
165
166 if (matches != nullptr) {
167 // Collect all candidates, most confident comes first
168 for (int i = 0; i < matches_count; ++i) {
169 std::string encoding = ucsdet_getName(matches[i], &status);
170
171 // Fixes to ensure proper Windows encodings
172 if (encoding == "Shift_JIS") {
173 encodings.emplace_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
174 } else if (encoding == "EUC-KR") {
175 encodings.emplace_back("windows-949-2000"); // Korean with \ as backlash
176 } else if (encoding == "GB18030") {
177 encodings.emplace_back("windows-936-2000"); // Simplified Chinese
178 } else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
179 encodings.emplace_back("ibm-5348_P100-1997"); // Occidental with Euro
180 } else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
181 encodings.emplace_back("ibm-5346_P100-1998"); // Central Europe with Euro
182 } else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
183 encodings.emplace_back("ibm-5347_P100-1998"); // Cyrillic with Euro
184 } else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
185 encodings.emplace_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
186 } else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
187 encodings.emplace_back("ibm-5349_P100-1998"); // Greek with Euro
188 } else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
189 encodings.emplace_back("ibm-9447_P100-2002"); // Hebrew with Euro
190 } else {
191 encodings.push_back(encoding);
192 }
193 }
194 }
195 ucsdet_close(detector);
196 }
197#endif
198
199 return encodings;
200}
201
202std::string ReaderUtil::GetEncoding(StringView ini_file) {
203 INIReader ini(ToString(ini_file));
204 if (ini.ParseError() != -1) {
205 std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
206 if (!encoding.empty()) {
207 return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
208 }
209 }
210 return std::string();
211}
212
213std::string ReaderUtil::GetEncoding(std::istream& filestream) {
214 INIReader ini(filestream);
215 if (ini.ParseError() != -1) {
216 std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
217 if (!encoding.empty()) {
218 return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
219 }
220 }
221 return std::string();
222}
223
224std::string ReaderUtil::GetLocaleEncoding() {
225#ifdef _WIN32
226 int codepage = GetACP();
227#elif __ANDROID__
228 // No std::locale support in NDK
229 // Doesn't really matter because the Android version auto-detects via ICU
230 int codepage = 1252;
231#else
232 int codepage = 1252;
233
234 std::locale loc = std::locale("");
235 // Gets the language and culture part only
236 std::string loc_full = loc.name().substr(0, loc.name().find_first_of("@."));
237 // Gets the language part only
238 std::string loc_lang = loc.name().substr(0, loc.name().find_first_of("_"));
239
240 if (loc_lang == "th") codepage = 874;
241 else if (loc_lang == "ja") codepage = 932;
242 else if (loc_full == "zh_CN" ||
243 loc_full == "zh_SG") codepage = 936;
244 else if (loc_lang == "ko") codepage = 949;
245 else if (loc_full == "zh_TW" ||
246 loc_full == "zh_HK") codepage = 950;
247 else if (loc_lang == "cs" ||
248 loc_lang == "hu" ||
249 loc_lang == "pl" ||
250 loc_lang == "ro" ||
251 loc_lang == "hr" ||
252 loc_lang == "sk" ||
253 loc_lang == "sl") codepage = 1250;
254 else if (loc_lang == "ru") codepage = 1251;
255 else if (loc_lang == "ca" ||
256 loc_lang == "da" ||
257 loc_lang == "de" ||
258 loc_lang == "en" ||
259 loc_lang == "es" ||
260 loc_lang == "fi" ||
261 loc_lang == "fr" ||
262 loc_lang == "it" ||
263 loc_lang == "nl" ||
264 loc_lang == "nb" ||
265 loc_lang == "pt" ||
266 loc_lang == "sv" ||
267 loc_lang == "eu") codepage = 1252;
268 else if (loc_lang == "el") codepage = 1253;
269 else if (loc_lang == "tr") codepage = 1254;
270 else if (loc_lang == "he") codepage = 1255;
271 else if (loc_lang == "ar") codepage = 1256;
272 else if (loc_lang == "et" ||
273 loc_lang == "lt" ||
274 loc_lang == "lv") codepage = 1257;
275 else if (loc_lang == "vi") codepage = 1258;
276#endif
277
278 return CodepageToEncoding(codepage);
279}
280
281std::string ReaderUtil::Recode(StringView str_to_encode, StringView source_encoding) {
282 return ReaderUtil::Recode(str_to_encode, source_encoding, "UTF-8");
283}
284
285std::string ReaderUtil::Recode(StringView str_to_encode,
286 StringView src_enc,
287 StringView dst_enc) {
288
289 if (src_enc.empty() || dst_enc.empty() || str_to_encode.empty()) {
290 return ToString(str_to_encode);
291 }
292
293 auto src_cp = SvAtoi(src_enc);
294 const auto& src_enc_str = src_cp > 0
295 ? ReaderUtil::CodepageToEncoding(src_cp)
296 : ToString(src_enc);
297
298 auto dst_cp = SvAtoi(dst_enc);
299 const auto& dst_enc_str = dst_cp > 0
300 ? ReaderUtil::CodepageToEncoding(dst_cp)
301 : ToString(dst_enc);
302
303#if LCF_SUPPORT_ICU
304 auto status = U_ZERO_ERROR;
305 auto conv_from = ucnv_open(src_enc_str.c_str(), &status);
306
307 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
308 fprintf(stderr, "liblcf: ucnv_open() error for source encoding \"%s\": %s\n", src_enc_str.c_str(), u_errorName(status));
309 return std::string();
310 }
311 status = U_ZERO_ERROR;
312 auto conv_from_sg = makeScopeGuard([&]() { ucnv_close(conv_from); });
313
314 auto conv_to = ucnv_open(dst_enc_str.c_str(), &status);
315
316 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
317 fprintf(stderr, "liblcf: ucnv_open() error for dest encoding \"%s\": %s\n", dst_enc_str.c_str(), u_errorName(status));
318 return std::string();
319 }
320 auto conv_to_sg = makeScopeGuard([&]() { ucnv_close(conv_to); });
321 status = U_ZERO_ERROR;
322
323 std::string result(str_to_encode.size() * 4, '\0');
324 auto* src = str_to_encode.data();
325 auto* dst = &result.front();
326
327 ucnv_convertEx(conv_to, conv_from,
328 &dst, dst + result.size(),
329 &src, src + str_to_encode.size(),
330 nullptr, nullptr, nullptr, nullptr,
331 true, true,
332 &status);
333
334 if (U_FAILURE(status)) {
335 fprintf(stderr, "liblcf: ucnv_convertEx() error when encoding \"%.*s\": %s\n", (int)str_to_encode.length(), str_to_encode.data(), u_errorName(status));
336 return std::string();
337 }
338
339 result.resize(dst - result.c_str());
340 result.shrink_to_fit();
341
342 return result;
343#else
344 iconv_t cd = iconv_open(dst_enc_str.c_str(), src_enc_str.c_str());
345 if (cd == (iconv_t)-1)
346 return ToString(str_to_encode);
347 char *src = const_cast<char *>(str_to_encode.data());
348 size_t src_left = str_to_encode.size();
349 size_t dst_size = str_to_encode.size() * 5 + 10;
350 char *dst = new char[dst_size];
351 size_t dst_left = dst_size;
352# ifdef ICONV_CONST
353 char ICONV_CONST *p = src;
354# else
355 char *p = src;
356# endif
357 char *q = dst;
358 size_t status = iconv(cd, &p, &src_left, &q, &dst_left);
359 iconv_close(cd);
360 if (status == (size_t) -1 || src_left > 0) {
361 delete[] dst;
362 return std::string();
363 }
364 *q++ = '\0';
365 std::string result(dst);
366 delete[] dst;
367 return result;
368#endif
369}
370
371std::string ReaderUtil::Normalize(StringView str) {
372#if LCF_SUPPORT_ICU
373 icu::UnicodeString uni = icu::UnicodeString(str.data(), str.length(), "utf-8").toLower();
374 UErrorCode err = U_ZERO_ERROR;
375 std::string res;
376 const icu::Normalizer2* norm = icu::Normalizer2::getNFKCInstance(err);
377 if (U_FAILURE(err)) {
378 static bool err_reported = false;
379 if (!err_reported) {
380 fprintf(stderr, "Normalizer2::getNFKCInstance failed (%s). \"nrm\" is probably missing in the ICU data file. Unicode normalization will not work!\n", u_errorName(err));
381 err_reported = true;
382 }
383 uni.toUTF8String(res);
384 return res;
385 }
386 icu::UnicodeString f = norm->normalize(uni, err);
387 if (U_FAILURE(err)) {
388 uni.toUTF8String(res);
389 } else {
390 f.toUTF8String(res);
391 }
392 return res;
393#else
394 auto result = std::string(str);
395 std::transform(result.begin(), result.end(), result.begin(), tolower);
396 return result;
397#endif
398}
399
400} //namespace lcf
Definition: dbarray.cpp:13