Context Navigation

source: branches/1.1.x/src/org/armedbear/lisp/util/DecodingReader.java

Visit:

Last change on this file was 12902, checked in by vvoutilainen, 15 years ago
Fix reading of data containing scandinavian latin1 characters correctly, and add a simple test for it. The utf-8 test is just a sanity test so that umlauts as utf-8 aren't broken, the latin1 test properly fails without this patch and passes with this patch.
Property svn:eol-style set to `native` Property svn:keywords set to `Id`
File size: 9.4 KB

Line
1	/*
2	* DecodingReader.java
3	*
4	* Copyright (C) 2010 Erik Huelsmann
5	* $Id: DecodingReader.java 12902 2010-08-28 11:09:13Z vvoutilainen $
6	*
7	* This program is free software; you can redistribute it and/or
8	* modify it under the terms of the GNU General Public License
9	* as published by the Free Software Foundation; either version 2
10	* of the License, or (at your option) any later version.
11	*
12	* This program is distributed in the hope that it will be useful,
13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15	* GNU General Public License for more details.
16	*
17	* You should have received a copy of the GNU General Public License
18	* along with this program; if not, write to the Free Software
19	* Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20	*
21	* As a special exception, the copyright holders of this library give you
22	* permission to link this library with independent modules to produce an
23	* executable, regardless of the license terms of these independent
24	* modules, and to copy and distribute the resulting executable under
25	* terms of your choice, provided that you also meet, for each linked
26	* independent module, the terms and conditions of the license of that
27	* module. An independent module is a module which is not derived from
28	* or based on this library. If you modify this library, you may extend
29	* this exception to your version of the library, but you are not
30	* obligated to do so. If you do not wish to do so, delete this
31	* exception statement from your version.
32	*/
33
34	package org.armedbear.lisp.util;
35
36	import java.io.IOException;
37	import java.io.InputStream;
38	import java.io.PushbackInputStream;
39	import java.io.PushbackReader;
40	import java.io.Reader;
41	import java.io.StringReader;
42	import java.nio.ByteBuffer;
43	import java.nio.CharBuffer;
44	import java.nio.charset.Charset;
45	import java.nio.charset.CharsetDecoder;
46	import java.nio.charset.CharsetEncoder;
47	import java.nio.charset.CoderResult;
48	import java.nio.charset.CodingErrorAction;
49
50	import org.armedbear.lisp.Debug;
51
/** Class to support mid-stream change of character encoding
 * to support setExternalFormat operation in Stream.java
 *
 * Bytes are read from the wrapped stream into an internal byte buffer
 * and decoded on demand with the current charset. Unread characters are
 * re-encoded to bytes and placed back in front of the not-yet-decoded
 * bytes, so a later {@link #setCharset(Charset)} applies to them as well.
 *
 * Note: extends PushbackReader, but only for its interface;
 * all methods are overridden.
 */
public class DecodingReader
    extends PushbackReader {

    // dummy reader which we need to call the Pushback constructor
    // because a null value won't work
    private static final Reader staticReader = new StringReader("");

    // contains the currently buffered bytes read from the stream;
    // always kept in 'read mode' (position..limit = undecoded bytes)
    private final ByteBuffer bbuf;

    // stream to read from, wrapped in a PushbackInputStream
    private final PushbackInputStream stream;

    // Decoder, used for decoding characters on the input stream
    private CharsetDecoder cd;

    // Encoder, used to put characters back on the input stream when unreading
    private CharsetEncoder ce;

    /** Creates a reader decoding bytes from 'stream' using charset 'cs'.
     *
     * @param stream the byte stream to read from
     * @param size   capacity of both the internal byte buffer and the
     *               pushback buffer of the wrapped stream
     * @param cs     the initial charset
     */
    public DecodingReader(InputStream stream, int size, Charset cs) {
        super(staticReader); // pass a dummy stream value into the constructor

        // we need to be able to unread the byte buffer
        this.stream = new PushbackInputStream(stream, size);
        this.bbuf = ByteBuffer.allocate(size);
        this.bbuf.flip();  // mark the buffer as 'needs refill'
        setCharset(cs);
    }

    /** Change the Charset used to decode bytes from the input stream
     * into characters.
     *
     * Malformed and unmappable input is replaced rather than reported.
     */
    public final void setCharset(Charset cs) {
        this.cd = cs.newDecoder();
        this.cd.onUnmappableCharacter(CodingErrorAction.REPLACE);
        this.cd.onMalformedInput(CodingErrorAction.REPLACE);
        this.ce = cs.newEncoder();
    }

    /** Get the Charset used to decode bytes from the input stream. */
    public final Charset getCharset() {
        return this.cd.charset();
    }

    /** Closes the underlying stream. */
    @Override
    public final void close() throws IOException {
        stream.close();
    }

    /** Not supported; always throws IOException. */
    @Override
    public final void mark(int readAheadLimit) throws IOException {
        throw new IOException("mark/reset not supported.");
    }

    @Override
    public final boolean markSupported() {
        return false;
    }

    /** Returns true when bytes are available, either already buffered
     * or reported as available by the underlying stream.
     */
    @Override
    public final boolean ready() throws IOException {
        return stream.available() != 0 || bbuf.remaining() != 0;
    }

    /** Not supported; always throws IOException. */
    @Override
    public final void reset() throws IOException {
        throw new IOException("reset/mark not supported.");
    }

    /** Skips 'n' characters, or as many as can be read off the stream
     * before its end.
     *
     * Returns the number of characters actually skipped
     *
     * @throws IllegalArgumentException if 'n' is negative
     */
    @Override
    public final long skip(long n) throws IOException {
        if (n < 0)
            throw new IllegalArgumentException("skip value is negative");
        if (n == 0)
            return 0;

        char[] cbuf = new char[(int)Math.min(4096, n)];
        long remaining = n;

        while (remaining > 0) {
            int r = read(cbuf, 0, (int)Math.min(cbuf.length, remaining));

            if (r < 0) // end of stream: report what was skipped so far
                return n - remaining;

            // count down by the number of characters actually consumed
            remaining -= r;
        }

        return n;
    }

    /** Unread a single code point.
     *
     * Decomposes the code point into UTF-16 surrogate pairs
     * and unreads them using the char[] unreader function.
     *
     */
    @Override
    public final void unread(int c) throws IOException {
        char[] ch = Character.toChars(c);
        unread(ch, 0, ch.length);
    }

    /** Unread the character array into the reader.
     *
     * Encodes the characters in the array into bytes,
     * allowing the encoding to be changed before reading from
     * the stream again, using a different charset.
     *
     * @throws IOException if the characters cannot be encoded in the
     *         current charset, or if the pushback capacity is exceeded
     */
    @Override
    public final void unread(char[] cbuf, int off, int len) throws IOException {

        ByteBuffer tb = // temp buffer holding the encoded characters
            ce.encode(CharBuffer.wrap(cbuf, off, len));
        int needed = tb.remaining();

        if (needed > bbuf.position()) {
            // not enough room in front of the buffered data
            if (needed > bbuf.capacity())
                throw new IOException("Pushback buffer overflow");

            // unread bbuf into the pushback input stream
            // in order to free up space for the content of 'tb'
            for (int i = bbuf.limit(); i-- > bbuf.position(); )
                stream.unread(bbuf.get(i));

            // Reuse the bytes already encoded in 'tb'. Encoding again
            // through the three-argument encode() without a reset is an
            // illegal state for the encoder once the convenience
            // encode() above has completed.
            bbuf.clear();
            bbuf.put(tb);
            bbuf.flip();
        } else {
            // Don't unread bbuf, since tb will fit in front of the
            // existing data
            int start = bbuf.position() - needed;
            for (int i = 0; i < needed; i++)
                bbuf.put(start + i, tb.get(i));

            bbuf.position(start);
        }
    }

    @Override
    public final void unread(char[] cbuf) throws IOException {
        unread(cbuf, 0, cbuf.length);
    }

    /** Fill bbuf, either when empty or when forced.
     *
     * Returns false when the underlying stream reported end-of-stream
     * during the refill attempt, true otherwise. In both cases bbuf is
     * left in read mode.
     */
    private boolean ensureBbuf(boolean force) throws IOException {
        if (bbuf.remaining() == 0 || force) {
            bbuf.compact();

            int size = stream.available();
            if (size > bbuf.remaining() || size == 0)
                // by reading more than the available bytes when
                // none available, block only if we need to on
                // interactive streams
                size = bbuf.remaining();

            byte[] by = new byte[size];
            int c = stream.read(by);

            if (c < 0) {
                bbuf.flip();  // prepare bbuf for reading
                return false;
            }

            bbuf.put(by, 0, c);
            bbuf.flip();
        }
        return true;
    }

    /** Reads a single code point.
     *
     * Returns the code point read, or -1 at end of stream.
     */
    @Override
    public final int read() throws IOException {
        // read the first UTF-16 character
        char[] ch = new char[1];

        int i = read(ch, 0, 1);
        if (i < 0)
            return i;

        // if this is not a high surrogate,
        // it must be a character which doesn't need one
        if (! Character.isHighSurrogate(ch[0]))
            return ch[0];

        // save the high surrogate and read the low surrogate
        char high = ch[0];
        i = read(ch, 0, 1);
        if (i < 0)
            return i;

        // combine the two and return the resulting code point
        return Character.toCodePoint(high, ch[0]);
    }

    @Override
    public final int read(char[] cbuf, int off, int len) throws IOException {
        CharBuffer cb = CharBuffer.wrap(cbuf, off, len);
        return read(cb);
    }

    /** Decodes characters into 'cb' until it is full or the end of the
     * stream is reached.
     *
     * Returns the number of characters stored, or -1 when none could
     * be stored.
     */
    @Override
    public final int read(CharBuffer cb) throws IOException {
        int len = cb.remaining();
        boolean notEof = true;
        boolean forceRead = false;

        while (cb.remaining() > 0 && notEof) {
            int oldRemaining = cb.remaining();
            notEof = ensureBbuf(forceRead);
            CoderResult r = cd.decode(bbuf, cb, ! notEof);
            if (oldRemaining == cb.remaining()
                && CoderResult.OVERFLOW == r) {
                // if this happens, the decoding failed
                // but the bufs didn't advance. Advance
                // them manually and do manual replacing,
                // otherwise we loop endlessly. This occurs
                // at least when parsing latin1 files with
                // lowercase o-umlauts in them.
                // Note that this is at the moment copy-paste
                // with RandomAccessCharacterFile.read()
                cb.put('?');
                bbuf.get();
            }
            // an underflow means the decoder wants more input bytes
            forceRead = (CoderResult.UNDERFLOW == r);

            if (! notEof)
                // The decoder has been told the input ended. Reset it so
                // that bytes unread after this point can be decoded
                // again without an illegal-state failure.
                cd.reset();
        }
        if (cb.remaining() == len)
            return -1;
        else
            return len - cb.remaining();
    }

    @Override
    public final int read(char[] cbuf) throws IOException {
        return read(cbuf, 0, cbuf.length);
    }

}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: