| 1 | /* |
|---|
| 2 | * DecodingStreamReader.java |
|---|
| 3 | * |
|---|
| 4 | * Copyright (C) 2010 Erik Huelsmann |
|---|
| 5 | * $Id: DecodingReader.java 12902 2010-08-28 11:09:13Z vvoutilainen $ |
|---|
| 6 | * |
|---|
| 7 | * This program is free software; you can redistribute it and/or |
|---|
| 8 | * modify it under the terms of the GNU General Public License |
|---|
| 9 | * as published by the Free Software Foundation; either version 2 |
|---|
| 10 | * of the License, or (at your option) any later version. |
|---|
| 11 | * |
|---|
| 12 | * This program is distributed in the hope that it will be useful, |
|---|
| 13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
|---|
| 14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|---|
| 15 | * GNU General Public License for more details. |
|---|
| 16 | * |
|---|
| 17 | * You should have received a copy of the GNU General Public License |
|---|
| 18 | * along with this program; if not, write to the Free Software |
|---|
| 19 | * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. |
|---|
| 20 | * |
|---|
| 21 | * As a special exception, the copyright holders of this library give you |
|---|
| 22 | * permission to link this library with independent modules to produce an |
|---|
| 23 | * executable, regardless of the license terms of these independent |
|---|
| 24 | * modules, and to copy and distribute the resulting executable under |
|---|
| 25 | * terms of your choice, provided that you also meet, for each linked |
|---|
| 26 | * independent module, the terms and conditions of the license of that |
|---|
| 27 | * module. An independent module is a module which is not derived from |
|---|
| 28 | * or based on this library. If you modify this library, you may extend |
|---|
| 29 | * this exception to your version of the library, but you are not |
|---|
| 30 | * obligated to do so. If you do not wish to do so, delete this |
|---|
| 31 | * exception statement from your version. |
|---|
| 32 | */ |
|---|
| 33 | |
|---|
| 34 | package org.armedbear.lisp.util; |
|---|
| 35 | |
|---|
| 36 | import java.io.IOException; |
|---|
| 37 | import java.io.InputStream; |
|---|
| 38 | import java.io.PushbackInputStream; |
|---|
| 39 | import java.io.PushbackReader; |
|---|
| 40 | import java.io.Reader; |
|---|
| 41 | import java.io.StringReader; |
|---|
| 42 | import java.nio.ByteBuffer; |
|---|
| 43 | import java.nio.CharBuffer; |
|---|
| 44 | import java.nio.charset.Charset; |
|---|
| 45 | import java.nio.charset.CharsetDecoder; |
|---|
| 46 | import java.nio.charset.CharsetEncoder; |
|---|
| 47 | import java.nio.charset.CoderResult; |
|---|
| 48 | import java.nio.charset.CodingErrorAction; |
|---|
| 49 | |
|---|
| 50 | import org.armedbear.lisp.Debug; |
|---|
| 51 | |
|---|
/**
 * Reader that decodes bytes from an input stream into characters and
 * supports a mid-stream change of character encoding, in support of the
 * setExternalFormat operation in Stream.java.
 *
 * <p>Bytes are buffered in an internal {@link ByteBuffer}. Characters that are
 * unread are encoded back into bytes, so that they can be decoded again
 * after {@link #setCharset(Charset)} has installed a different charset.
 *
 * <p>Note: extends PushbackReader, but only for its interface;
 * all methods are overridden.
 */
public class DecodingReader
    extends PushbackReader {

    // dummy reader which we need to call the Pushback constructor
    // because a null value won't work
    private static final Reader DUMMY_READER = new StringReader("");

    // contains the currently buffered bytes read from the stream;
    // always left in "read mode" (position..limit are the pending bytes)
    private final ByteBuffer bbuf;

    // stream to read from, wrapped in a PushbackInputStream
    private final PushbackInputStream stream;

    // Decoder, used for decoding characters on the input stream
    private CharsetDecoder cd;

    // Encoder, used to put characters back on the input stream when unreading
    private CharsetEncoder ce;

    /**
     * Creates a reader decoding {@code stream} using charset {@code cs}.
     *
     * @param stream the byte stream to read from
     * @param size   capacity, in bytes, of both the internal byte buffer and
     *               the pushback buffer wrapped around {@code stream}
     * @param cs     the initial charset
     */
    public DecodingReader(InputStream stream, int size, Charset cs) {
        super(DUMMY_READER); // pass a dummy stream value into the constructor

        // we need to be able to unread the byte buffer
        this.stream = new PushbackInputStream(stream, size);
        this.bbuf = ByteBuffer.allocate(size);
        this.bbuf.flip();  // mark the buffer as 'needs refill'
        setCharset(cs);    // final method, safe to call from the constructor
    }

    /**
     * Change the Charset used to decode bytes from the input stream
     * into characters.
     *
     * <p>Malformed input and unmappable characters are replaced by the
     * decoder rather than reported as errors.
     *
     * @param cs the charset to use from now on
     */
    public final void setCharset(Charset cs) {
        this.cd = cs.newDecoder();
        this.cd.onUnmappableCharacter(CodingErrorAction.REPLACE);
        this.cd.onMalformedInput(CodingErrorAction.REPLACE);
        this.ce = cs.newEncoder();
    }

    /** Get the Charset used to decode bytes from the input stream. */
    public final Charset getCharset() {
        return this.cd.charset();
    }

    /** Closes the underlying (pushback-wrapped) input stream. */
    @Override
    public final void close() throws IOException {
        stream.close();
    }

    /** Always fails: mark/reset is not supported by this reader. */
    @Override
    public final void mark(int readAheadLimit) throws IOException {
        throw new IOException("mark/reset not supported.");
    }

    /** @return {@code false}; mark/reset is not supported */
    @Override
    public final boolean markSupported() {
        return false;
    }

    /**
     * @return {@code true} if the underlying stream reports available bytes
     *         or the internal byte buffer still holds undecoded bytes
     */
    @Override
    public final boolean ready() throws IOException {
        return stream.available() != 0 || bbuf.remaining() != 0;
    }

    /** Always fails: mark/reset is not supported by this reader. */
    @Override
    public final void reset() throws IOException {
        throw new IOException("reset/mark not supported.");
    }

    /**
     * Skips 'n' characters, or as many as can be read off the stream
     * before its end.
     *
     * @param n the number of characters to skip
     * @return the number of characters actually skipped
     * @throws IllegalArgumentException if {@code n} is negative
     */
    @Override
    public final long skip(long n) throws IOException {
        if (n < 0) {
            throw new IllegalArgumentException("skip value is negative");
        }
        if (n == 0) {
            return 0;
        }

        char[] cbuf = new char[(int) Math.min(4096, n)];
        long remaining = n;

        while (remaining > 0) {
            int r = read(cbuf, 0, (int) Math.min(cbuf.length, remaining));

            if (r < 0) {
                // end of stream: report what was skipped so far
                return n - remaining;
            }

            // count down by what was really read; a read may return fewer
            // characters than requested
            remaining -= r;
        }

        return n;
    }

    /**
     * Unread a single code point.
     *
     * Decomposes the code point into UTF-16 surrogate pairs
     * and unreads them using the char[] unreader function.
     */
    @Override
    public final void unread(int c) throws IOException {
        char[] ch = Character.toChars(c);
        unread(ch, 0, ch.length);
    }

    /**
     * Unread the character array into the reader.
     *
     * Encodes the characters in the array into bytes,
     * allowing the encoding to be changed before reading from
     * the stream again, using a different charset.
     *
     * @throws IOException if the characters cannot be encoded in the current
     *         charset, if the encoded bytes do not fit in the byte buffer, or
     *         if the underlying pushback stream rejects the bytes pushed back
     */
    @Override
    public final void unread(char[] cbuf, int off, int len) throws IOException {

        // encode once; the result is flipped (position 0, limit = byte count)
        ByteBuffer tb = ce.encode(CharBuffer.wrap(cbuf, off, len));
        int count = tb.remaining();

        if (count > bbuf.position()) {
            if (count > bbuf.capacity()) {
                // check before touching any state so a failure leaves the
                // reader unchanged
                throw new IOException("Pushback buffer overflow");
            }

            // unread bbuf into the pushback input stream
            // in order to free up space for the content of 'tb'
            for (int i = bbuf.limit(); i-- > bbuf.position(); ) {
                stream.unread(bbuf.get(i));
            }

            // Copy the bytes already encoded. Re-running the encoder here
            // is illegal: the convenience encode() above leaves it flushed.
            bbuf.clear();
            bbuf.put(tb);
            bbuf.flip();
        } else {
            // Don't unread bbuf, since tb will fit in front of the
            // existing data
            int start = bbuf.position() - count;
            for (int i = 0; i < count; i++) {
                bbuf.put(start + i, tb.get(i));
            }
            bbuf.position(start);
        }
    }

    /** Unreads the whole array; see {@link #unread(char[], int, int)}. */
    @Override
    public final void unread(char[] cbuf) throws IOException {
        unread(cbuf, 0, cbuf.length);
    }

    /**
     * Fill bbuf from the stream, either when it is empty or when forced.
     *
     * @param force refill even though bbuf still holds bytes (used when the
     *              decoder needs more input to complete a character)
     * @return {@code false} if a refill was attempted and the stream is at
     *         its end, {@code true} otherwise
     */
    private boolean ensureBbuf(boolean force) throws IOException {
        if (bbuf.remaining() == 0 || force) {
            bbuf.compact(); // keep pending bytes, switch to "write mode"

            int size = stream.available();
            if (size > bbuf.remaining() || size == 0) {
                // by reading more than the available bytes when
                // none available, block only if we need to on
                // interactive streams
                size = bbuf.remaining();
            }

            byte[] by = new byte[size];
            int c = stream.read(by);

            if (c < 0) {
                bbuf.flip(); // prepare bbuf for reading
                return false;
            }

            bbuf.put(by, 0, c);
            bbuf.flip();
        }
        return true;
    }

    /**
     * Reads a single code point.
     *
     * <p>If the first char read is a high surrogate, a second char is read
     * and combined with it. NOTE(review): if the stream ends right after a
     * high surrogate, -1 is returned and the surrogate is dropped.
     *
     * @return the code point read, or -1 at end of stream
     */
    @Override
    public final int read() throws IOException {
        // read the first UTF-16 character
        char[] ch = new char[1];

        int i = read(ch, 0, 1);
        if (i < 0) {
            return i;
        }

        // if this is not a high surrogate,
        // it must be a character which doesn't need one
        if (!Character.isHighSurrogate(ch[0])) {
            return ch[0];
        }

        // save the high surrogate and read the low surrogate
        char high = ch[0];
        i = read(ch, 0, 1);
        if (i < 0) {
            return i;
        }

        // combine the two and return the resulting code point
        return Character.toCodePoint(high, ch[0]);
    }

    /** Reads up to {@code len} chars into {@code cbuf} starting at {@code off}. */
    @Override
    public final int read(char[] cbuf, int off, int len) throws IOException {
        CharBuffer cb = CharBuffer.wrap(cbuf, off, len);
        return read(cb);
    }

    /**
     * Decodes bytes from the stream into {@code cb} until it is full or the
     * stream ends.
     *
     * @return the number of chars stored, 0 if {@code cb} has no room, or -1
     *         if nothing could be read because the stream is at its end
     */
    @Override
    public final int read(CharBuffer cb) throws IOException {
        int len = cb.remaining();
        if (len == 0) {
            // a zero-length request is not an end-of-stream condition
            return 0;
        }

        boolean notEof = true;
        boolean forceRead = false;

        while (cb.remaining() > 0 && notEof) {
            int oldRemaining = cb.remaining();
            notEof = ensureBbuf(forceRead);
            CoderResult r = cd.decode(bbuf, cb, !notEof);
            if (!notEof) {
                // The decoder was told the input ended. Reset it so bytes
                // pushed back by a later unread() can still be decoded;
                // otherwise the next decode throws IllegalStateException.
                cd.reset();
            }
            if (oldRemaining == cb.remaining()
                && CoderResult.OVERFLOW == r) {
                // if this happens, the decoding failed
                // but the bufs didn't advance. Advance
                // them manually and do manual replacing,
                // otherwise we loop endlessly. This occurs
                // at least when parsing latin1 files with
                // lowercase o-umlauts in them.
                // Note that this is at the moment copy-paste
                // with RandomAccessCharacterFile.read()
                cb.put('?');
                bbuf.get();
            }
            // an UNDERFLOW means the decoder wants more bytes than bbuf holds
            forceRead = (CoderResult.UNDERFLOW == r);
        }
        if (cb.remaining() == len) {
            return -1;
        } else {
            return len - cb.remaining();
        }
    }

    /** Reads chars into the whole array; see {@link #read(char[], int, int)}. */
    @Override
    public final int read(char[] cbuf) throws IOException {
        return read(cbuf, 0, cbuf.length);
    }

}
|---|