001package ball.io;
002/*-
003 * ##########################################################################
004 * Utilities
005 * $Id: UnicodeReader.java 6118 2020-06-04 19:31:45Z ball $
006 * $HeadURL: svn+ssh://svn.hcf.dev/var/spool/scm/repository.svn/ball-util/trunk/src/main/java/ball/io/UnicodeReader.java $
007 * %%
008 * Copyright (C) 2008 - 2020 Allen D. Ball
009 * %%
010 * Licensed under the Apache License, Version 2.0 (the "License");
011 * you may not use this file except in compliance with the License.
012 * You may obtain a copy of the License at
013 *
014 *      http://www.apache.org/licenses/LICENSE-2.0
015 *
016 * Unless required by applicable law or agreed to in writing, software
017 * distributed under the License is distributed on an "AS IS" BASIS,
018 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
019 * See the License for the specific language governing permissions and
020 * limitations under the License.
021 * ##########################################################################
022 */
023import java.io.File;
024import java.io.FileInputStream;
025import java.io.FileNotFoundException;
026import java.io.InputStream;
027import java.io.InputStreamReader;
028import java.io.LineNumberReader;
029import java.io.PushbackInputStream;
030import java.nio.charset.Charset;
031import java.util.Arrays;
032import java.util.Map;
033import java.util.Objects;
034import lombok.ToString;
035
036import static java.nio.charset.StandardCharsets.UTF_8;
037
038/**
039 * {@link java.io.BufferedReader} implementation which analyzes the
040 * underlying {@link InputStream} for byte order marks and selects the
041 * appropriate {@link Charset}.
042 *
043 * @see BOMCharsetMap
044 *
045 * @author {@link.uri mailto:ball@hcf.dev Allen D. Ball}
046 * @version $Revision: 6118 $
047 */
048public class UnicodeReader extends LineNumberReader {
049    private static final Charset DEFAULT = UTF_8;
050
051    /**
052     * @param   file            The {@link File} to open.
053     *
054     * @throws  FileNotFoundException
055     *                          If the {@link File} is not found.
056     */
057    public UnicodeReader(File file) throws FileNotFoundException {
058        this(new FileInputStream(file));
059    }
060
061    /**
062     * @param   in              The underlying {@link InputStream}.
063     */
064    public UnicodeReader(InputStream in) {
065        this(in instanceof CharsetDetectInputStream
066                 ? ((CharsetDetectInputStream) in)
067                 : new CharsetDetectInputStream(in, DEFAULT));
068    }
069
070    private UnicodeReader(CharsetDetectInputStream in) {
071        super(new InputStreamReader(in, in.getCharset()));
072    }
073
074    @Override
075    public String toString() { return super.toString(); }
076
077    @ToString
078    private static class CharsetDetectInputStream extends PushbackInputStream {
079        private final Charset charset;
080
081        public CharsetDetectInputStream(InputStream in, Charset charset) {
082            super(in, 8);
083
084            try {
085                for (Map.Entry<byte[],Charset> entry :
086                         BOMCharsetMap.INSTANCE.entrySet()) {
087                    byte[] bytes = new byte[entry.getKey().length];
088                    int length = read(bytes);
089
090                    if (length < 0) {
091                        break;
092                    }
093
094                    if (bytes.length == length
095                        && Arrays.equals(bytes, entry.getKey())) {
096                        charset = entry.getValue();
097                        break;
098                    } else {
099                        if (length > 0) {
100                            unread(bytes, 0, length);
101                        }
102                    }
103                }
104
105                this.charset = Objects.requireNonNull(charset);
106            } catch (Exception exception) {
107                throw new ExceptionInInitializerError(exception);
108            }
109        }
110
111        public Charset getCharset() { return charset; }
112    }
113}