Changeset 12902


Ignore:
Timestamp:
08/28/10 11:09:13 (13 years ago)
Author:
vvoutilainen
Message:

Fix reading of data containing Scandinavian Latin-1 characters,
and add a simple test for it. The UTF-8 test is just a sanity
check that umlauts encoded as UTF-8 aren't broken; the Latin-1
test fails without this patch and passes with it.

Location:
trunk/abcl
Files:
3 added
3 edited

Legend:

Unmodified
Added
Removed
  • trunk/abcl/abcl.asd

    r12899 r12902  
    4646                      (:file "math-tests")
    4747                      (:file "misc-tests")
     48                      (:file "latin1-tests")
    4849                      (:file "bugs" :depends-on ("file-system-tests"))
    4950                      (:file "pathname-tests")))))
  • trunk/abcl/src/org/armedbear/lisp/util/DecodingReader.java

    r12759 r12902  
    4646import java.nio.charset.CharsetEncoder;
    4747import java.nio.charset.CoderResult;
     48import java.nio.charset.CodingErrorAction;
    4849
    4950import org.armedbear.lisp.Debug;
     
    8081        this.stream = new PushbackInputStream(stream, size);
    8182        this.cd = cs.newDecoder();
     83        this.cd.onUnmappableCharacter(CodingErrorAction.REPLACE);
     84        this.cd.onMalformedInput(CodingErrorAction.REPLACE);
    8285        this.ce = cs.newEncoder();
    8386        bbuf = ByteBuffer.allocate(size);
     
    9093    public final void setCharset(Charset cs) {
    9194        this.cd = cs.newDecoder();
     95        this.cd.onUnmappableCharacter(CodingErrorAction.REPLACE);
     96        this.cd.onMalformedInput(CodingErrorAction.REPLACE);
    9297        this.ce = cs.newEncoder();
    9398    }
     
    258263
    259264        while (cb.remaining() > 0 && notEof) {
     265            int oldRemaining = cb.remaining();
    260266            notEof = ensureBbuf(forceRead);
    261267            CoderResult r = cd.decode(bbuf, cb, ! notEof);
     268            if (oldRemaining == cb.remaining()
     269                && CoderResult.OVERFLOW == r) {
     270                // if this happens, the decoding failed
     271                // but the bufs didn't advance. Advance
     272                // them manually and do manual replacing,
     273                // otherwise we loop endlessly. This occurs
     274                // at least when parsing latin1 files with
     275                // lowercase o-umlauts in them.
     276                // Note that this is at the moment copy-paste
     277                // with RandomAccessCharacterFile.read()
     278                cb.put('?');
     279                bbuf.get();
     280            }
    262281            forceRead = (CoderResult.UNDERFLOW == r);
    263 
    264             if (r.isMalformed()) {
    265                 throw new RACFMalformedInputException(bbuf.position(),
    266                                                       (char)bbuf.get(bbuf.position()),
    267                                                       cd.charset().name());
    268             } else if (r.isUnmappable()) {
    269                 // a situation exactly like this is in DecodingReader too
    270                 Debug.assertTrue(false);
    271             }
    272282        }
    273283        if (cb.remaining() == len)
  • trunk/abcl/src/org/armedbear/lisp/util/RandomAccessCharacterFile.java

    r12513 r12902  
    371371        boolean atEof = false;
    372372        while ((cbuf.remaining() > 0) && ! atEof) {
    373 
     373            int oldRemaining = cbuf.remaining();
    374374            atEof = ! ensureReadBbuf(decodeWasUnderflow);
    375375            CoderResult r = cdec.decode(bbuf, cbuf, atEof );
     376            if (oldRemaining == cbuf.remaining()
     377                && CoderResult.OVERFLOW == r) {
     378                // if this happens, the decoding failed
     379                // but the bufs didn't advance. Advance
     380                // them manually and do manual replacing,
     381                // otherwise we loop endlessly. This occurs
     382                // at least when parsing latin1 files with
     383                // lowercase o-umlauts in them
     384                // Note that this is at the moment copy-paste
     385                // with DecodingReader.read()
     386                cbuf.put('?');
     387                bbuf.get();
     388            }
    376389            decodeWasUnderflow = (CoderResult.UNDERFLOW == r);
    377             if (r.isMalformed())
    378                 // When reading encoded Unicode, we'd expect to require
    379                 // catching MalformedInput
    380                 throw new RACFMalformedInputException(bbuf.position(),
    381                                                       (char)bbuf.get(bbuf.position()),
    382                                                       cset.name());
    383             if (r.isUnmappable())
    384                 // Since we're mapping TO unicode, we'd expect to be able
    385                 // to map all characters
    386                 Debug.assertTrue(false);
    387             // OVERFLOW is a normal condition:
    388             //  it's equal to cbuf.remaining() == 0
    389             // ### EHU: really??? EXACTLY equal??
    390390        }
    391391        if (cbuf.remaining() == len) {
Note: See TracChangeset for help on using the changeset viewer.