c++-gtk-utils
reassembler.h
Go to the documentation of this file.
1 /* Copyright (C) 2005 to 2010 Chris Vine
2 
3 The library comprised in this file or of which this file is part is
4 distributed by Chris Vine under the GNU Lesser General Public
5 License as follows:
6 
7  This library is free software; you can redistribute it and/or
8  modify it under the terms of the GNU Lesser General Public License
9  as published by the Free Software Foundation; either version 2.1 of
10  the License, or (at your option) any later version.
11 
12  This library is distributed in the hope that it will be useful, but
13  WITHOUT ANY WARRANTY; without even the implied warranty of
14  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15  Lesser General Public License, version 2.1, for more details.
16 
17  You should have received a copy of the GNU Lesser General Public
18  License, version 2.1, along with this library (see the file LGPL.TXT
19  which came with this source code package in the c++-gtk-utils
20  sub-directory); if not, write to the Free Software Foundation, Inc.,
21  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 
23 */
24 
25 #ifndef CGU_REASSEMBLER_H
26 #define CGU_REASSEMBLER_H
27 
30 
31 namespace Cgu {
32 
33 namespace Utf8 {
34 
35 
36 /**
37  * @class Reassembler reassembler.h c++-gtk-utils/reassembler.h
38  * @brief A class for reassembling UTF-8 strings sent over pipes and
39  * sockets so they form complete valid UTF-8 characters.
40  *
41  * Utf8::Reassembler is a functor class which takes in a partially
42  * formed UTF-8 string and returns a nul-terminated string comprising
43  * such of the input string (after inserting, at the beginning, any
44  * partially formed UTF-8 character which was at the end of the input
45  * string passed in previous calls to the functor) as forms complete
46  * UTF-8 characters (storing any partial character at the end for the
47  * next call to the functor). If the input string contains invalid
48  * UTF-8 after adding any stored previous part character (apart from
49  * any partially formed character at the end of the input string) then
50  * operator() will return a null Cgu::SharedHandle<char*> object (that
51  * is, Cgu::SharedHandle<char*>::get() will return 0). Such input
52  * will not be treated as invalid if it consists only of a single
53  * partly formed UTF-8 character which could be valid if further bytes
54  * were received and added to it. In that case the returned
55  * SharedHandle<char*> object will contain an allocated string of zero
56  * length, comprising only a terminating \0 character, rather than a
57  * NULL pointer.
58  *
59  * This enables UTF-8 strings to be sent over pipes, sockets, etc. and
60  * displayed in a GTK+ object at the receiving end.
61  *
62  * Note that for efficiency reasons the memory held in the returned
63  * Cgu::SharedHandle<char*> object may be greater than the length of
64  * the nul-terminated string that is contained in that memory: just
65  * let the Cgu::SharedHandle<char*> object manage the memory, and use
66  * the contents like any other nul-terminated string.
67  *
68  * This class is not needed if std::getline(), with its default '\\n'
69  * delimiter, is used to read UTF-8 characters using, say,
70  * Cgu::fdistream, because a whole '\\n' delimited line of UTF-8
71  * characters will always be complete.
72  *
73  * This is an example of its use, reading from a pipe until it is
74  * closed by the writer and putting the received text in a
75  * GtkTextBuffer object:
76  * @code
77  * using namespace Cgu;
78  *
79  * GtkTextIter end;
80  * GtkTextBuffer* text_buffer = gtk_text_view_get_buffer(GTK_TEXT_VIEW(text_view));
81  * gtk_text_buffer_get_end_iter(text_buffer, &end);
82  *
83  * Utf8::Reassembler reassembler;
84  * const int BSIZE = 1024;
85  * char read_buffer[BSIZE];
86  * ssize_t res;
87  * do {
88  * res = ::read(fd, read_buffer, BSIZE);
89  * if (res > 0) {
90  * SharedHandle<char*> utf8(reassembler(read_buffer, res));
91  * if (utf8.get()) {
92  * gtk_text_buffer_insert(text_buffer, &end,
93  * utf8.get(), std::strlen(utf8.get()));
94  * }
95  * else std::cerr << "Invalid utf8 text sent over pipe\n";
96  * }
97  * } while (res && (res != -1 || errno == EINTR));
98  * @endcode
99  *
100  * This class maintains an array as a data member, containing partly
101  * formed characters from previous calls to operator(), and should not
102  * be copied. There should be no reason to do so, but unfortunately
103  * enforcing this by explicitly precluding copy construction and copy
104  * assignment was overlooked when this class was first provided. At
105  * the next API break, the copy constructor will be explicitly deleted
106  * and moving only allowed. Where a Reassembler object is to be
107  * moved, use std::move and the code will be safe against this change
108  * in the future.
109  */
110 
111 class Reassembler {
112  size_t stored;
113  const static size_t buff_size = 6;
114  char buffer[buff_size];
115  char* join_buffer(const char*, size_t);
116 public:
117 /**
118  * Takes a byte array of wholly or partly formed UTF-8 characters to
119  * be converted (after taking account of previous calls to the method)
120  * to a valid string of wholly formed characters.
121  * @param input The input array.
122  * @param size The number of bytes in the input (not the number of
123  * UTF-8 characters).
124  * @return A Cgu::SharedHandle<char*> object holding a nul-terminated
125  * string comprising such of the input (after inserting, at the
126  * beginning, any partially formed UTF-8 character which was at the
127  * end of the input passed in previous calls to the functor) as forms
128  * complete UTF-8 characters (storing any partial character at the end
129  * for the next call to the functor). If the input is invalid after
130  * such recombination, then a null Cgu::SharedHandle<char*> object is
131  * returned (that is, Cgu::SharedHandle<char*>::get() will return 0).
132  * Such input will not be treated as invalid if it consists only of a
133  * single partly formed UTF-8 character which could be valid if
134  * further bytes were received and added to it. In that case the
135  * returned Cgu::SharedHandle<char*> object will contain an allocated
136  * string of zero length, comprising only a terminating \0 character,
137  * rather than a NULL pointer.
138  * @exception std::bad_alloc The method might throw std::bad_alloc if
139  * memory is exhausted and the system throws in that case. It will
140  * not throw any other exception.
141  */
142  Cgu::SharedHandle<char*> operator()(const char* input, size_t size);
143 
144 /**
145  * Gets the number of bytes of a partially formed UTF-8 character
146  * stored for the next call to operator()(). It will not throw.
147  * @return The number of bytes.
148  */
149  size_t get_stored() const {return stored;}
150 
151 /**
152  * Resets the Reassembler, by discarding any partially formed UTF-8
153  * character from previous calls to operator()(). It will not throw.
154  */
155  void reset() {stored = 0;}
156 
157 /**
158  * The constructor will not throw.
159  */
160  Reassembler(): stored(0) {}
161 
162  // TODO: At the next API break, provide a default and move
163  // constructor and move assignment operator, and omit a copy
164  // constructor and copy assignment operator: this class maintains an
165  // array as a data member
166 
167 /* Only has effect if --with-glib-memory-slices-compat or
168  * --with-glib-memory-slices-no-compat option picked */
170 };
171 
172 } // namespace Utf8
173 
174 } // namespace Cgu
175 
176 #endif