1 | /* |
---|
2 | * DecodingReader.java |
---|
3 | * |
---|
4 | * Copyright (C) 2010 Erik Huelsmann |
---|
5 | * $Id: DecodingReader.java 12902 2010-08-28 11:09:13Z vvoutilainen $ |
---|
6 | * |
---|
7 | * This program is free software; you can redistribute it and/or |
---|
8 | * modify it under the terms of the GNU General Public License |
---|
9 | * as published by the Free Software Foundation; either version 2 |
---|
10 | * of the License, or (at your option) any later version. |
---|
11 | * |
---|
12 | * This program is distributed in the hope that it will be useful, |
---|
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
---|
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
---|
15 | * GNU General Public License for more details. |
---|
16 | * |
---|
17 | * You should have received a copy of the GNU General Public License |
---|
18 | * along with this program; if not, write to the Free Software |
---|
19 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
---|
20 | * |
---|
21 | * As a special exception, the copyright holders of this library give you |
---|
22 | * permission to link this library with independent modules to produce an |
---|
23 | * executable, regardless of the license terms of these independent |
---|
24 | * modules, and to copy and distribute the resulting executable under |
---|
25 | * terms of your choice, provided that you also meet, for each linked |
---|
26 | * independent module, the terms and conditions of the license of that |
---|
27 | * module. An independent module is a module which is not derived from |
---|
28 | * or based on this library. If you modify this library, you may extend |
---|
29 | * this exception to your version of the library, but you are not |
---|
30 | * obligated to do so. If you do not wish to do so, delete this |
---|
31 | * exception statement from your version. |
---|
32 | */ |
---|
33 | |
---|
34 | package org.armedbear.lisp.util; |
---|
35 | |
---|
36 | import java.io.IOException; |
---|
37 | import java.io.InputStream; |
---|
38 | import java.io.PushbackInputStream; |
---|
39 | import java.io.PushbackReader; |
---|
40 | import java.io.Reader; |
---|
41 | import java.io.StringReader; |
---|
42 | import java.nio.ByteBuffer; |
---|
43 | import java.nio.CharBuffer; |
---|
44 | import java.nio.charset.Charset; |
---|
45 | import java.nio.charset.CharsetDecoder; |
---|
46 | import java.nio.charset.CharsetEncoder; |
---|
47 | import java.nio.charset.CoderResult; |
---|
48 | import java.nio.charset.CodingErrorAction; |
---|
49 | |
---|
50 | import org.armedbear.lisp.Debug; |
---|
51 | |
---|
/** Reader which decodes characters from a byte stream and allows the
 * character encoding to be changed mid-stream, in support of the
 * setExternalFormat operation in Stream.java.
 *
 * Undecoded bytes are kept in an internal byte buffer. Characters
 * which are unread are encoded back into bytes and placed in front of
 * that buffer, so that they can be decoded again, possibly using a
 * different charset installed through {@link #setCharset(Charset)}.
 *
 * Note: extends PushbackReader, but only for its interface;
 * all methods are overridden.
 *
 * This class performs no synchronization of its own.
 */
public class DecodingReader
    extends PushbackReader {

    // dummy reader which we need to call the Pushback constructor
    // because a null value won't work
    private static final Reader DUMMY_READER = new StringReader("");

    // contains the currently buffered bytes read from the stream;
    // always kept in 'read mode' (position..limit = undecoded bytes)
    private final ByteBuffer bbuf;

    // stream to read from, wrapped in a PushbackInputStream
    private final PushbackInputStream stream;

    // Decoder, used for decoding characters on the input stream
    private CharsetDecoder cd;

    // Encoder, used to put characters back on the input stream when unreading
    private CharsetEncoder ce;

    // true when the last decode() call on 'cd' signalled end-of-input;
    // the decoder must be reset before it may decode 'more input' again
    private boolean decoderAtEnd;

    /** Creates a reader decoding the bytes of 'stream' using charset 'cs'.
     *
     * @param stream the byte stream to read from
     * @param size the capacity, in bytes, of both the internal byte
     *        buffer and the pushback buffer wrapped around 'stream'.
     *        NOTE(review): presumably this has to be at least as large
     *        as the longest byte sequence of a single character in any
     *        charset used - confirm.
     * @param cs the charset initially used for decoding
     */
    public DecodingReader(InputStream stream, int size, Charset cs) {
        super(DUMMY_READER); // pass a dummy stream value into the constructor

        // we need to be able to unread the byte buffer
        this.stream = new PushbackInputStream(stream, size);
        this.bbuf = ByteBuffer.allocate(size);
        this.bbuf.flip();  // mark the buffer as 'needs refill'
        setCharset(cs);
    }

    /** Change the Charset used to decode bytes from the input stream
     * into characters.
     *
     * Bytes which are already buffered but not yet decoded will be
     * decoded using the new charset. Malformed and unmappable input
     * is replaced instead of reported.
     *
     * @param cs the charset to use from now on
     */
    public final void setCharset(Charset cs) {
        // create both coders before assigning, so that a failure
        // leaves the previous decoder/encoder pair in place
        CharsetDecoder decoder = cs.newDecoder();
        decoder.onUnmappableCharacter(CodingErrorAction.REPLACE);
        decoder.onMalformedInput(CodingErrorAction.REPLACE);
        CharsetEncoder encoder = cs.newEncoder();

        this.cd = decoder;
        this.ce = encoder;
        this.decoderAtEnd = false;  // a new decoder starts in its reset state
    }

    /** Get the Charset used to decode bytes from the input stream. */
    public final Charset getCharset() {
        return this.cd.charset();
    }

    /** Closes the underlying input stream. */
    @Override
    public final void close() throws IOException {
        stream.close();
    }

    /** Not supported.
     *
     * @throws IOException always
     */
    @Override
    public final void mark(int readAheadLimit) throws IOException {
        throw new IOException("mark/reset not supported.");
    }

    /** Returns false: mark/reset is not supported. */
    @Override
    public final boolean markSupported() {
        return false;
    }

    /** Returns true when the underlying stream reports available bytes
     * or when undecoded bytes are left in the internal buffer.
     */
    @Override
    public final boolean ready() throws IOException {
        return stream.available() != 0 || bbuf.remaining() != 0;
    }

    /** Not supported.
     *
     * @throws IOException always
     */
    @Override
    public final void reset() throws IOException {
        throw new IOException("reset/mark not supported.");
    }

    /** Skips 'n' characters, or as many as can be read off the stream
     * before its end.
     *
     * @param n the number of characters to skip
     * @return the number of characters actually skipped
     * @throws IllegalArgumentException if 'n' is negative
     */
    @Override
    public final long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("skip value is negative");
        }

        char[] cbuf = new char[(int)Math.min(4096, n)];
        long remaining = n;

        while (remaining > 0) {
            int r = read(cbuf, 0, (int)Math.min(cbuf.length, remaining));

            if (r < 0) {
                break;  // end of stream: report what was skipped so far
            }

            // count down by the number of characters actually read
            remaining -= r;
        }

        return n - remaining;
    }

    /** Unread a single code point.
     *
     * Decomposes the code point into its UTF-16 representation
     * (one char, or a surrogate pair) and unreads it using the
     * char[] unreader function.
     *
     * @param c the code point to unread
     */
    @Override
    public final void unread(int c) throws IOException {
        char[] ch = Character.toChars(c);
        unread(ch, 0, ch.length);
    }

    /** Unread the character array into the reader.
     *
     * Encodes the characters in the array into bytes using the
     * current charset and places them in front of the buffered bytes,
     * allowing the encoding to be changed before reading from
     * the stream again, using a different charset.
     *
     * @param cbuf the characters to unread
     * @param off index of the first character to unread
     * @param len number of characters to unread
     * @throws IOException if the characters cannot be encoded, if the
     *         encoded bytes exceed the buffer capacity or if the
     *         underlying pushback stream is full
     */
    @Override
    public final void unread(char[] cbuf, int off, int len) throws IOException {

        ByteBuffer tb = // temp buffer
            ce.encode(CharBuffer.wrap(cbuf, off, len));
        int count = tb.remaining();

        if (count > bbuf.position()) {
            // Not enough room in front of the buffered data.
            if (count > bbuf.capacity()) {
                throw new IOException("Pushback buffer overflow");
            }

            // unread bbuf into the pushback input stream
            // in order to free up space for the content of 'tb';
            // the bulk call is all-or-nothing and leaves the first
            // buffered byte as the next byte read from the stream
            if (bbuf.hasRemaining()) {
                stream.unread(bbuf.array(),
                              bbuf.arrayOffset() + bbuf.position(),
                              bbuf.remaining());
            }

            // Reuse the bytes encoded above: the encoder has been
            // flushed by the encode() call and can't be used to encode
            // again without being reset.
            bbuf.clear();
            bbuf.put(tb);
            bbuf.flip();
        } else {
            // Don't unread bbuf, since tb will fit in front of the
            // existing data
            int start = bbuf.position() - count;
            for (int i = 0; i < count; i++) {
                bbuf.put(start + i, tb.get(i));
            }

            bbuf.position(start);
        }
    }

    /** Unread all characters of the array; see
     * {@link #unread(char[], int, int)}.
     */
    @Override
    public final void unread(char[] cbuf) throws IOException {
        unread(cbuf, 0, cbuf.length);
    }

    // fill bbuf, either when empty or when forced;
    // returns false when the end of the stream was hit while filling
    private boolean ensureBbuf(boolean force) throws IOException {
        if (bbuf.remaining() != 0 && !force) {
            return true;
        }

        // move undecoded bytes to the front and switch to 'write mode'
        bbuf.compact();

        int size = stream.available();
        if (size > bbuf.remaining() || size == 0) {
            // by reading more than the available bytes when
            // none available, block only if we need to on
            // interactive streams
            size = bbuf.remaining();
        }

        // read directly behind the bytes kept by compact()
        int c = stream.read(bbuf.array(),
                            bbuf.arrayOffset() + bbuf.position(),
                            size);
        if (c > 0) {
            bbuf.position(bbuf.position() + c);
        }

        bbuf.flip(); // prepare bbuf for reading
        return c >= 0;
    }

    /** Reads a single code point.
     *
     * When the first char read is a high surrogate, a second char is
     * read and both are combined into one code point.
     *
     * NOTE(review): each char is requested with room for just one
     * char; a decoder presumably can't deliver a surrogate pair that
     * way, in which case the replacement workaround in
     * {@link #read(CharBuffer)} kicks in - confirm.
     *
     * @return the code point read, or -1 at the end of the stream
     */
    @Override
    public final int read() throws IOException {
        // read the first UTF-16 character
        char[] ch = new char[1];

        if (read(ch, 0, 1) < 0) {
            return -1;
        }

        // if this is not a high surrogate,
        // it must be a character which doesn't need one
        char first = ch[0];
        if (! Character.isHighSurrogate(first)) {
            return first;
        }

        // the high surrogate is saved in 'first'; read the low surrogate
        if (read(ch, 0, 1) < 0) {
            return -1;
        }

        // combine the two and return the resulting code point
        return Character.toCodePoint(first, ch[0]);
    }

    /** Reads up to 'len' characters into 'cbuf', starting at 'off'.
     *
     * @return the number of characters read, 0 when 'len' is 0,
     *         or -1 when no characters could be read before the end
     *         of the stream
     */
    @Override
    public final int read(char[] cbuf, int off, int len) throws IOException {
        CharBuffer cb = CharBuffer.wrap(cbuf, off, len);
        return read(cb);
    }

    /** Decodes characters into 'cb' until it is full or the end of
     * the stream has been reached.
     *
     * @return the number of characters added to 'cb', 0 when 'cb' has
     *         no room, or -1 when no characters could be read before
     *         the end of the stream
     */
    @Override
    public final int read(CharBuffer cb) throws IOException {
        int len = cb.remaining();
        if (len == 0) {
            // nothing requested; don't report a false end-of-stream
            return 0;
        }

        boolean notEof = true;
        boolean forceRead = false;

        while (cb.hasRemaining() && notEof) {
            int oldRemaining = cb.remaining();
            notEof = ensureBbuf(forceRead);

            boolean endOfInput = ! notEof;
            if (decoderAtEnd && ! endOfInput) {
                // The decoder has seen end-of-input before, but there
                // are bytes to decode again (e.g. unread ones); it
                // rejects further input until it has been reset.
                cd.reset();
            }
            decoderAtEnd = endOfInput;

            CoderResult r = cd.decode(bbuf, cb, endOfInput);
            if (oldRemaining == cb.remaining()
                && CoderResult.OVERFLOW == r) {
                // if this happens, the decoding failed
                // but the bufs didn't advance. Advance
                // them manually and do manual replacing,
                // otherwise we loop endlessly. This occurs
                // at least when parsing latin1 files with
                // lowercase o-umlauts in them.
                // Note that this is at the moment copy-paste
                // with RandomAccessCharacterFile.read()
                cb.put('?');
                bbuf.get();
            }

            // the decoder ran out of input: get more bytes next round,
            // even when undecoded bytes are left in the buffer
            forceRead = (CoderResult.UNDERFLOW == r);
        }

        int count = len - cb.remaining();
        return (count == 0) ? -1 : count;
    }

    /** Reads characters into the entire array; see
     * {@link #read(char[], int, int)}.
     */
    @Override
    public final int read(char[] cbuf) throws IOException {
        return read(cbuf, 0, cbuf.length);
    }

}
---|