source: branches/1.1.x/src/org/armedbear/lisp/util/DecodingReader.java

Last change on this file was 12902, checked in by vvoutilainen, 14 years ago

Fix reading of data containing Scandinavian Latin-1 characters,
and add a simple test for it. The UTF-8 test is
just a sanity test so that umlauts as UTF-8 aren't broken; the
Latin-1 test properly fails without this patch and passes
with this patch.

  • Property svn:eol-style set to native
  • Property svn:keywords set to Id
File size: 9.4 KB
Line 
/*
 * DecodingReader.java
 *
 * Copyright (C) 2010 Erik Huelsmann
 * $Id: DecodingReader.java 12902 2010-08-28 11:09:13Z vvoutilainen $
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
 *
 * As a special exception, the copyright holders of this library give you
 * permission to link this library with independent modules to produce an
 * executable, regardless of the license terms of these independent
 * modules, and to copy and distribute the resulting executable under
 * terms of your choice, provided that you also meet, for each linked
 * independent module, the terms and conditions of the license of that
 * module.  An independent module is a module which is not derived from
 * or based on this library.  If you modify this library, you may extend
 * this exception to your version of the library, but you are not
 * obligated to do so.  If you do not wish to do so, delete this
 * exception statement from your version.
 */
33
34package org.armedbear.lisp.util;
35
36import java.io.IOException;
37import java.io.InputStream;
38import java.io.PushbackInputStream;
39import java.io.PushbackReader;
40import java.io.Reader;
41import java.io.StringReader;
42import java.nio.ByteBuffer;
43import java.nio.CharBuffer;
44import java.nio.charset.Charset;
45import java.nio.charset.CharsetDecoder;
46import java.nio.charset.CharsetEncoder;
47import java.nio.charset.CoderResult;
48import java.nio.charset.CodingErrorAction;
49
50import org.armedbear.lisp.Debug;
51
/**
 * Reader that decodes characters from a byte stream and allows the
 * character encoding to be changed mid-stream, in support of the
 * setExternalFormat operation in Stream.java.
 *
 * <p>Bytes are only decoded when characters are requested, and unread
 * characters are re-encoded into bytes, so that a charset installed with
 * {@link #setCharset(Charset)} applies to everything not yet returned to
 * the caller.
 *
 * <p>Note: extends PushbackReader, but only for its interface;
 * all methods are overridden.
 */
public class DecodingReader
    extends PushbackReader {

    // upper bound for the scratch buffer used by skip()
    private static final int SKIP_CHUNK = 4096;

    // dummy reader which we need to call the Pushback constructor
    // because a null value won't work
    private static final Reader staticReader = new StringReader("");

    // contains the currently buffered bytes read from the stream;
    // kept in "read mode" (position..limit is the undecoded data)
    private final ByteBuffer bbuf;

    // stream to read from, wrapped in a PushbackInputStream
    private final PushbackInputStream stream;

    // Decoder, used for decoding characters on the input stream
    private CharsetDecoder cd;

    // Encoder, used to put characters back on the input stream when unreading
    private CharsetEncoder ce;

    /**
     * Creates a reader decoding the bytes of 'stream' using charset 'cs'.
     *
     * @param stream the byte stream to read from
     * @param size   capacity, in bytes, of both the internal byte buffer
     *               and the pushback buffer wrapped around 'stream'
     * @param cs     the initial charset
     */
    public DecodingReader(InputStream stream, int size, Charset cs) {
        super(staticReader); // pass a dummy stream value into the constructor

        // we need to be able to unread the byte buffer
        this.stream = new PushbackInputStream(stream, size);
        this.bbuf = ByteBuffer.allocate(size);
        this.bbuf.flip();  // mark the buffer as 'needs refill'
        setCharset(cs);
    }

    /** Change the Charset used to decode bytes from the input stream
     * into characters.
     *
     * Malformed and unmappable input is replaced rather than reported.
     */
    public final void setCharset(Charset cs) {
        this.cd = cs.newDecoder()
            .onUnmappableCharacter(CodingErrorAction.REPLACE)
            .onMalformedInput(CodingErrorAction.REPLACE);
        this.ce = cs.newEncoder();
    }

    /** Get the Charset used to decode bytes from the input stream. */
    public final Charset getCharset() {
        return this.cd.charset();
    }

    /** Closes the underlying byte stream. */
    @Override
    public final void close() throws IOException {
        stream.close();
    }

    /** Always fails: mark/reset is not supported by this reader. */
    @Override
    public final void mark(int readAheadLimit) throws IOException {
        throw new IOException("mark/reset not supported.");
    }

    /** Returns false: mark/reset is not supported by this reader. */
    @Override
    public final boolean markSupported() {
        return false;
    }

    /** Returns true when bytes are available from the stream or
     * undecoded bytes are left in the internal buffer.
     */
    @Override
    public final boolean ready() throws IOException {
        return stream.available() != 0 || bbuf.remaining() != 0;
    }

    /** Always fails: mark/reset is not supported by this reader. */
    @Override
    public final void reset() throws IOException {
        throw new IOException("reset/mark not supported.");
    }

    /** Skips 'n' characters, or as many as can be read off the stream
     * before its end.
     *
     * Returns the number of characters actually skipped
     *
     * @throws IllegalArgumentException if 'n' is negative
     */
    @Override
    public final long skip(long n) throws IOException {
        if (n < 0)
            throw new IllegalArgumentException("skip value is negative");

        char[] scratch = new char[(int)Math.min(SKIP_CHUNK, n)];
        long remaining = n;

        while (remaining > 0) {
            int r = read(scratch, 0, (int)Math.min(scratch.length, remaining));

            if (r < 0)
                break;  // end of stream: report what was skipped so far

            // count what was really read, not what was asked for
            remaining -= r;
        }

        return n - remaining;
    }

    /** Unread a single code point.
     *
     * Decomposes the code point into UTF-16 surrogate pairs
     * and unreads them using the char[] unreader function.
     *
     */
    @Override
    public final void unread(int c) throws IOException {
        char[] ch = Character.toChars(c);
        unread(ch, 0, ch.length);
    }

    /** Unread the character array into the reader.
     *
     * Encodes the characters in the array into bytes,
     * allowing the encoding to be changed before reading from
     * the stream again, using a different charset.
     *
     * @throws IOException if the characters cannot be encoded in the
     *         current charset, or the pushback buffer of the underlying
     *         stream is full
     */
    @Override
    public final void unread(char[] cbuf, int off, int len) throws IOException {

        // One-shot encode: resets the encoder, encodes and flushes.
        // The encoder must not be driven any further without a reset(),
        // so everything below works on the bytes in 'tb' only.
        ByteBuffer tb = ce.encode(CharBuffer.wrap(cbuf, off, len));
        int count = tb.remaining();

        if (count <= bbuf.position()) {
            // 'tb' fits in front of the existing data: copy it into the
            // already-consumed part of bbuf and move the position back
            int start = bbuf.position() - count;
            for (int i = 0; i < count; i++)
                bbuf.put(start + i, tb.get(i));

            bbuf.position(start);
            return;
        }

        // Not enough room in front of the buffered data: unread bbuf
        // into the pushback input stream (last byte first, so the bytes
        // come back in their original order) to free up space for 'tb'
        for (int i = bbuf.limit(); i-- > bbuf.position(); )
            stream.unread(bbuf.get(i));

        bbuf.limit(bbuf.position()); // its content now lives in 'stream'

        // Whatever part of 'tb' exceeds the capacity of bbuf goes to the
        // pushback stream too, in front of the bytes unread above
        int excess = count - bbuf.capacity();
        if (excess > 0) {
            int tailStart = count - excess;
            byte[] tail = new byte[excess];
            for (int i = 0; i < excess; i++)
                tail[i] = tb.get(tailStart + i);

            stream.unread(tail);
            tb.limit(tailStart);
        }

        bbuf.clear();
        bbuf.put(tb);
        bbuf.flip();
    }

    /** Unread the complete character array; see unread(char[], int, int). */
    @Override
    public final void unread(char[] cbuf) throws IOException {
        unread(cbuf, 0, cbuf.length);
    }

    /** Fill bbuf from the stream, either when it is empty or when forced.
     *
     * Leaves bbuf ready for reading in all cases.
     *
     * Returns false if a read was attempted and the stream reported
     * end-of-stream, true otherwise
     */
    private boolean ensureBbuf(boolean force) throws IOException {
        if (bbuf.remaining() != 0 && ! force)
            return true;

        bbuf.compact(); // keep undecoded bytes, switch to 'fill' mode

        int size = stream.available();
        if (size > bbuf.remaining() || size == 0)
            // by reading more than the available bytes when
            // none available, block only if we need to on
            // interactive streams
            size = bbuf.remaining();

        byte[] by = new byte[size];
        int c = stream.read(by);

        if (c < 0) {
            bbuf.flip();  // prepare bbuf for reading
            return false;
        }

        bbuf.put(by, 0, c);
        bbuf.flip();
        return true;
    }

    /** Read a single code point.
     *
     * Returns the code point, or -1 at the end of the stream.
     */
    @Override
    public final int read() throws IOException {
        // NOTE(review): the target buffer holds one char only. A decoder
        // that needs room for a surrogate pair appears to end up in the
        // 'no progress' fallback of read(CharBuffer), which would turn
        // supplementary characters into '?' -- to be verified.

        // read the first UTF-16 character
        char[] ch = new char[1];

        if (read(ch, 0, 1) < 0)
            return -1;

        // if this is not a high surrogate,
        // it must be a character which doesn't need one
        char first = ch[0];
        if (! Character.isHighSurrogate(first))
            return first;

        // keep the high surrogate and read the low surrogate
        if (read(ch, 0, 1) < 0)
            return -1;

        // combine the two and return the resulting code point
        return Character.toCodePoint(first, ch[0]);
    }

    /** Read up to 'len' characters into 'cbuf', starting at 'off'.
     *
     * Returns the number of characters read, or -1 at the end
     * of the stream.
     */
    @Override
    public final int read(char[] cbuf, int off, int len) throws IOException {
        return read(CharBuffer.wrap(cbuf, off, len));
    }

    /** Decode characters into 'cb' until it is full or the stream ends.
     *
     * Returns the number of characters stored, 0 if 'cb' has no room,
     * or -1 if the end of the stream was reached before any character
     * could be stored.
     */
    @Override
    public final int read(CharBuffer cb) throws IOException {
        int len = cb.remaining();
        if (len == 0)
            return 0;  // nothing requested; this is not end-of-stream

        boolean notEof = true;
        boolean forceRead = false;

        while (cb.remaining() > 0 && notEof) {
            int oldRemaining = cb.remaining();
            notEof = ensureBbuf(forceRead);
            CoderResult r = cd.decode(bbuf, cb, ! notEof);
            if (oldRemaining == cb.remaining()
                && CoderResult.OVERFLOW == r) {
                // if this happens, the decoding failed
                // but the bufs didn't advance. Advance
                // them manually and do manual replacing,
                // otherwise we loop endlessly. This occurs
                // at least when parsing latin1 files with
                // lowercase o-umlauts in them.
                // Note that this is at the moment copy-paste
                // with RandomAccessCharacterFile.read()
                cb.put('?');
                bbuf.get();
            }
            // the decoder ran out of input: read more, even if
            // bbuf still holds an incomplete byte sequence
            forceRead = (CoderResult.UNDERFLOW == r);
        }

        if (! notEof)
            // The decoder has been told the input ended. Reset it, since
            // unread() may supply more bytes and decoding those with
            // endOfInput == false is illegal on an ended decoder.
            cd.reset();

        int count = len - cb.remaining();
        return (count == 0) ? -1 : count;
    }

    /** Read characters into the complete array; see read(char[], int, int). */
    @Override
    public final int read(char[] cbuf) throws IOException {
        return read(cbuf, 0, cbuf.length);
    }

}
Note: See TracBrowser for help on using the repository browser.