Open Chinese Convert 1.1.9
A project for conversion between Traditional and Simplified Chinese
Loading...
Searching...
No Matches
UTF8StringSlice.hpp
1/*
2 * Open Chinese Convert
3 *
4 * Copyright 2015 Carbo Kuo <byvoid@byvoid.com>
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19#include <cstring>
20
21#include "Common.hpp"
22#include "UTF8Util.hpp"
23
24namespace opencc {
25
26namespace internal {
27
/**
 * Computes an FNV-1a style hash over a range of bytes.
 *
 * Starting from FNV_offset_basis, every byte is XOR-ed into the running hash
 * and the hash is then multiplied by FNV_prime. The arithmetic is done in
 * size_t, so it wraps at the width of size_t.
 *
 * @param text             pointer to the first byte to hash
 * @param byteLength       number of bytes to hash; no byte is read when 0
 * @param FNV_prime        multiplier applied after each byte
 * @param FNV_offset_basis initial hash value, returned unchanged for empty
 *                         input
 * @return the hash of the byteLength bytes starting at text
 */
inline size_t FNVHash(const char* text, const size_t byteLength,
                      const size_t FNV_prime, const size_t FNV_offset_basis) {
  size_t hash = FNV_offset_basis;
  const char* const end = text + byteLength;
  for (const char* pstr = text; pstr < end; pstr++) {
    // Widen through unsigned char so that a byte >= 0x80 contributes only its
    // eight bits. A plain char may be signed, in which case it would be
    // sign-extended and flip every upper bit of the hash, making the result
    // depend on the platform's char signedness.
    hash ^= static_cast<size_t>(static_cast<unsigned char>(*pstr));
    hash *= FNV_prime;
  }
  return hash;
}
37
38template <int> size_t FNVHash(const char* text, const size_t byteLength);
39
40template <>
41inline size_t FNVHash<4>(const char* text, const size_t byteLength) {
42 return FNVHash(text, byteLength, 16777619UL, 2166136261UL);
43}
44
45#if SIZE_MAX == 0xffffffffffffffff
46template <>
47inline size_t FNVHash<8>(const char* text, const size_t byteLength) {
48 return FNVHash(text, byteLength, 1099511628211UL, 14695981039346656037UL);
49}
50#endif
51
52} // namespace internal
53
54template <typename LENGTH_TYPE> class UTF8StringSliceBase {
55public:
56 typedef LENGTH_TYPE LengthType;
57
58 UTF8StringSliceBase(const char* _str)
59 : str(_str), utf8Length(static_cast<LengthType>(UTF8Util::Length(_str))),
60 byteLength(static_cast<LengthType>(strlen(_str))) {}
61
62 UTF8StringSliceBase(const char* _str, const LengthType _utf8Length)
63 : str(_str), utf8Length(_utf8Length) {
64 CalculateByteLength();
65 }
66
67 UTF8StringSliceBase(const char* _str, const LengthType _utf8Length,
68 const LengthType _byteLength)
69 : str(_str), utf8Length(_utf8Length), byteLength(_byteLength) {
70 CalculateByteLength();
71 }
72
73 LengthType UTF8Length() const { return utf8Length; }
74
75 LengthType ByteLength() const { return byteLength; }
76
77 UTF8StringSliceBase Left(const LengthType numberOfCharacters) const {
78 if (numberOfCharacters == UTF8Length()) {
79 return *this;
80 } else {
81 return UTF8StringSliceBase(str, numberOfCharacters);
82 }
83 }
84
85 UTF8StringSliceBase Right(const LengthType numberOfCharacters) const {
86 if (numberOfCharacters == UTF8Length()) {
87 return *this;
88 } else {
89 const char* pstr = str + byteLength;
90 for (size_t i = 0; i < numberOfCharacters; i++) {
91 pstr = UTF8Util::PrevChar(pstr);
92 }
93 return UTF8StringSliceBase(pstr, numberOfCharacters);
94 }
95 }
96
97 UTF8StringSliceBase SubString(const LengthType offset,
98 const LengthType numberOfCharacters) const {
99 if (offset == 0) {
100 return Left(numberOfCharacters);
101 } else {
102 const char* pstr = str;
103 for (size_t i = 0; i < offset; i++) {
104 pstr = UTF8Util::NextChar(pstr);
105 }
106 return UTF8StringSliceBase(pstr, numberOfCharacters);
107 }
108 }
109
110 std::string ToString() const { return std::string(str, str + byteLength); }
111
112 const char* CString() const { return str; }
113
114 LengthType CommonPrefixLength(const UTF8StringSliceBase& that) const {
115 if (str == that.str) {
116 return (std::min)(utf8Length, that.utf8Length);
117 } else {
118 const char* pstr1 = str;
119 const char* pstr2 = that.str;
120 for (size_t length = 0; length < utf8Length && length < that.utf8Length;
121 length++) {
122 size_t charLen1 = UTF8Util::NextCharLength(pstr1);
123 size_t charLen2 = UTF8Util::NextCharLength(pstr2);
124 if (charLen1 != charLen2 || strncmp(pstr1, pstr2, charLen1) != 0) {
125 return length;
126 }
127 pstr1 += charLen1;
128 pstr2 += charLen2;
129 }
130 return 0;
131 }
132 }
133
134 void MoveRight() {
135 if (utf8Length > 0) {
136 const size_t charLen = UTF8Util::NextCharLength(str);
137 str += charLen;
138 utf8Length--;
139 byteLength -= charLen;
140 }
141 }
142
143 void MoveLeft() {
144 if (utf8Length > 0) {
145 const size_t charLen = UTF8Util::PrevCharLength(str + byteLength);
146 utf8Length--;
147 byteLength -= charLen;
148 }
149 }
150
151 int ReverseCompare(const UTF8StringSliceBase& that) const {
152 const char* pstr1 = str + byteLength;
153 const char* pstr2 = that.str + that.byteLength;
154 const size_t length = (std::min)(utf8Length, that.utf8Length);
155 for (size_t i = 0; i < length; i++) {
156 const size_t charLen1 = UTF8Util::PrevCharLength(pstr1);
157 const size_t charLen2 = UTF8Util::PrevCharLength(pstr2);
158 pstr1 -= charLen1;
159 pstr2 -= charLen2;
160 const int cmp = strncmp(pstr1, pstr2, (std::min)(charLen1, charLen2));
161 if (cmp < 0) {
162 return -1;
163 } else if (cmp > 0) {
164 return 1;
165 } else if (charLen1 < charLen2) {
166 return -1;
167 } else if (charLen1 > charLen2) {
168 return 1;
169 }
170 }
171 if (utf8Length < that.utf8Length) {
172 return -1;
173 } else if (utf8Length > that.utf8Length) {
174 return 1;
175 } else {
176 return 0;
177 }
178 }
179
180 LengthType FindBytePosition(const UTF8StringSliceBase& pattern) const {
181 return static_cast<LengthType>(
182 ToString().find(pattern.str, 0, pattern.byteLength));
183 }
184
185 bool operator<(const UTF8StringSliceBase& that) const {
186 return Compare(that) < 0;
187 }
188
189 bool operator>(const UTF8StringSliceBase& that) const {
190 return Compare(that) > 0;
191 }
192
193 bool operator==(const UTF8StringSliceBase& that) const {
194 return (str == that.str && utf8Length == that.utf8Length) ||
195 Compare(that) == 0;
196 }
197
198 bool operator!=(const UTF8StringSliceBase& that) const {
199 return !this->operator==(that);
200 }
201
202 class Hasher {
203 public:
204 size_t operator()(const UTF8StringSliceBase& text) const {
205 return internal::FNVHash<sizeof(size_t)>(text.CString(),
206 text.ByteLength());
207 }
208 };
209
210private:
211 inline int Compare(const UTF8StringSliceBase& that) const {
212 int cmp = strncmp(str, that.str, (std::min)(byteLength, that.byteLength));
213 if (cmp == 0) {
214 if (utf8Length < that.utf8Length) {
215 cmp = -1;
216 } else if (utf8Length > that.utf8Length) {
217 cmp = 1;
218 } else {
219 cmp = 0;
220 }
221 }
222 return cmp;
223 }
224
225 void CalculateByteLength() {
226 const char* pstr = str;
227 for (size_t i = 0; i < utf8Length; i++) {
228 pstr = UTF8Util::NextChar(pstr);
229 }
230 byteLength = static_cast<LengthType>(pstr - str);
231 }
232
233 const char* str;
234 LengthType utf8Length;
235 LengthType byteLength;
236};
237
238typedef UTF8StringSliceBase<size_t> UTF8StringSlice;
239
240template <typename LENGTH_TYPE>
241std::ostream& operator<<(::std::ostream& os,
242 const UTF8StringSliceBase<LENGTH_TYPE>& str) {
243 return os << str.ToString();
244}
245
246} // namespace opencc
Definition UTF8StringSlice.hpp:202
Definition UTF8StringSlice.hpp:54
static size_t PrevCharLength(const char *str)
Returns the length in bytes of the previous UTF-8 character.
Definition UTF8Util.hpp:81
static size_t Length(const char *str)
Returns the UTF8 length of a valid UTF8 std::string.
Definition UTF8Util.hpp:126
static size_t NextCharLength(const char *str)
Returns the length in bytes of the next UTF-8 character.
Definition UTF8Util.hpp:70
static const char * PrevChar(const char *str)
Moves the char* pointer back to the start of the previous UTF-8 character.
Definition UTF8Util.hpp:119
static const char * NextChar(const char *str)
Returns the char* pointer advanced past the next UTF-8 character.
Definition UTF8Util.hpp:112