001package ball.io; 002/*- 003 * ########################################################################## 004 * Utilities 005 * $Id: UnicodeReader.java 6118 2020-06-04 19:31:45Z ball $ 006 * $HeadURL: svn+ssh://svn.hcf.dev/var/spool/scm/repository.svn/ball-util/trunk/src/main/java/ball/io/UnicodeReader.java $ 007 * %% 008 * Copyright (C) 2008 - 2020 Allen D. Ball 009 * %% 010 * Licensed under the Apache License, Version 2.0 (the "License"); 011 * you may not use this file except in compliance with the License. 012 * You may obtain a copy of the License at 013 * 014 * http://www.apache.org/licenses/LICENSE-2.0 015 * 016 * Unless required by applicable law or agreed to in writing, software 017 * distributed under the License is distributed on an "AS IS" BASIS, 018 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 019 * See the License for the specific language governing permissions and 020 * limitations under the License. 021 * ########################################################################## 022 */ 023import java.io.File; 024import java.io.FileInputStream; 025import java.io.FileNotFoundException; 026import java.io.InputStream; 027import java.io.InputStreamReader; 028import java.io.LineNumberReader; 029import java.io.PushbackInputStream; 030import java.nio.charset.Charset; 031import java.util.Arrays; 032import java.util.Map; 033import java.util.Objects; 034import lombok.ToString; 035 036import static java.nio.charset.StandardCharsets.UTF_8; 037 038/** 039 * {@link java.io.BufferedReader} implementation which analyzes the 040 * underlying {@link InputStream} for byte order marks and selects the 041 * appropriate {@link Charset}. 042 * 043 * @see BOMCharsetMap 044 * 045 * @author {@link.uri mailto:ball@hcf.dev Allen D. Ball} 046 * @version $Revision: 6118 $ 047 */ 048public class UnicodeReader extends LineNumberReader { 049 private static final Charset DEFAULT = UTF_8; 050 051 /** 052 * @param file The {@link File} to open. 053 * 054 * @throws FileNotFoundException 055 * If the {@link File} is not found. 056 */ 057 public UnicodeReader(File file) throws FileNotFoundException { 058 this(new FileInputStream(file)); 059 } 060 061 /** 062 * @param in The underlying {@link InputStream}. 063 */ 064 public UnicodeReader(InputStream in) { 065 this(in instanceof CharsetDetectInputStream 066 ? ((CharsetDetectInputStream) in) 067 : new CharsetDetectInputStream(in, DEFAULT)); 068 } 069 070 private UnicodeReader(CharsetDetectInputStream in) { 071 super(new InputStreamReader(in, in.getCharset())); 072 } 073 074 @Override 075 public String toString() { return super.toString(); } 076 077 @ToString 078 private static class CharsetDetectInputStream extends PushbackInputStream { 079 private final Charset charset; 080 081 public CharsetDetectInputStream(InputStream in, Charset charset) { 082 super(in, 8); 083 084 try { 085 for (Map.Entry<byte[],Charset> entry : 086 BOMCharsetMap.INSTANCE.entrySet()) { 087 byte[] bytes = new byte[entry.getKey().length]; 088 int length = read(bytes); 089 090 if (length < 0) { 091 break; 092 } 093 094 if (bytes.length == length 095 && Arrays.equals(bytes, entry.getKey())) { 096 charset = entry.getValue(); 097 break; 098 } else { 099 if (length > 0) { 100 unread(bytes, 0, length); 101 } 102 } 103 } 104 105 this.charset = Objects.requireNonNull(charset); 106 } catch (Exception exception) { 107 throw new ExceptionInInitializerError(exception); 108 } 109 } 110 111 public Charset getCharset() { return charset; } 112 } 113}