Java中字节顺序标记(BOM)导致文件读取出错

我正在尝试使用Java读取CSV文件。其中一些文件(但不是全部)的开头可能带有字节顺序标记(BOM)。如果存在,字节顺序标记会与第一行的其余内容一起被读入,从而导致字符串比较出现问题。

是否存在一种跳过字节顺序标记的简单方法?

回答:

/* ____________________________________________________________________________

*

* File: UnicodeBOMInputStream.java

* Author: Gregory Pakosz.

* Date: 02 - November - 2005

* ____________________________________________________________________________

*/

package com.stackoverflow.answer;

import java.io.IOException;

import java.io.InputStream;

import java.io.PushbackInputStream;

/**
 * An <code>InputStream</code> wrapper that detects a Unicode BOM (Byte Order
 * Mark) at the beginning of the wrapped stream and can optionally skip it.
 * See
 * <a href="http://www.faqs.org/rfcs/rfc3629.html">RFC 3629 - UTF-8, a transformation format of ISO 10646</a>
 * and the
 * <a href="http://www.unicode.org/unicode/faq/utf_bom.html">Unicode FAQ</a>.
 *
 * <p>Five BOMs are recognised:</p>
 * <ul>
 * <li><pre>00 00 FE FF = UTF-32, big-endian</pre></li>
 * <li><pre>FF FE 00 00 = UTF-32, little-endian</pre></li>
 * <li><pre>FE FF       = UTF-16, big-endian</pre></li>
 * <li><pre>FF FE       = UTF-16, little-endian</pre></li>
 * <li><pre>EF BB BF    = UTF-8</pre></li>
 * </ul>
 *
 * <p>Detection happens once, in the constructor: up to four bytes are read
 * from the wrapped stream, inspected, and pushed back, so that a plain read
 * still returns the BOM bytes. Call {@link #getBOM()} to learn what was
 * detected and {@link #skipBOM()} to discard the detected BOM before reading
 * the payload.</p>
 */
public class UnicodeBOMInputStream extends InputStream
{
  /**
   * Type safe enumeration class that describes the different types of Unicode
   * BOMs. Instances are compared by identity; only the constants declared
   * here exist because the constructor is private.
   */
  public static final class BOM
  {
    /** No BOM: an empty byte sequence. */
    public static final BOM NONE = new BOM(new byte[]{}, "NONE");

    /** UTF-8 BOM (EF BB BF). */
    public static final BOM UTF_8 =
        new BOM(new byte[]{(byte)0xEF, (byte)0xBB, (byte)0xBF}, "UTF-8");

    /** UTF-16, little-endian (FF FE). */
    public static final BOM UTF_16_LE =
        new BOM(new byte[]{(byte)0xFF, (byte)0xFE}, "UTF-16 little-endian");

    /** UTF-16, big-endian (FE FF). */
    public static final BOM UTF_16_BE =
        new BOM(new byte[]{(byte)0xFE, (byte)0xFF}, "UTF-16 big-endian");

    /** UTF-32, little-endian (FF FE 00 00). */
    public static final BOM UTF_32_LE =
        new BOM(new byte[]{(byte)0xFF, (byte)0xFE, (byte)0x00, (byte)0x00},
                "UTF-32 little-endian");

    /** UTF-32, big-endian (00 00 FE FF). */
    public static final BOM UTF_32_BE =
        new BOM(new byte[]{(byte)0x00, (byte)0x00, (byte)0xFE, (byte)0xFF},
                "UTF-32 big-endian");

    /**
     * Returns the human readable description of this <code>BOM</code>
     * value, e.g. <code>"UTF-8"</code> or <code>"NONE"</code>.
     */
    public final String toString()
    {
      return description;
    }

    /**
     * Returns the bytes corresponding to this <code>BOM</code> value.
     *
     * @return a fresh copy of the BOM bytes; modifying it does not affect
     *         this <code>BOM</code>.
     */
    public final byte[] getBytes()
    {
      // Defensive copy so callers cannot corrupt the shared constant.
      final byte[] copy = new byte[bytes.length];
      System.arraycopy(bytes, 0, copy, 0, bytes.length);
      return copy;
    }

    private BOM(final byte[] bom, final String description)
    {
      assert(bom != null) : "invalid BOM: null is not allowed";
      assert(description != null) : "invalid description: null is not allowed";
      assert(description.length() != 0) : "invalid description: empty string is not allowed";

      this.bytes = bom;
      this.description = description;
    }

    // Package-private on purpose: the enclosing class reads it directly to
    // avoid the copy made by getBytes().
    final byte bytes[];

    private final String description;

  } // BOM

  /** Length of the longest BOM that can be detected (the UTF-32 ones). */
  private static final int MAX_BOM_LENGTH = 4;

  /**
   * Constructs a new <code>UnicodeBOMInputStream</code> that wraps the
   * specified <code>InputStream</code> and immediately probes it for a BOM.
   *
   * <p>Up to four bytes are consumed from <code>inputStream</code> and then
   * pushed back, so subsequent reads from this object start at the very
   * first byte of the wrapped stream (BOM included) until
   * {@link #skipBOM()} is called.</p>
   *
   * @param inputStream an <code>InputStream</code>.
   *
   * @throws NullPointerException when <code>inputStream</code> is
   * <code>null</code>.
   * @throws IOException on reading from the specified <code>InputStream</code>
   * when trying to detect the Unicode BOM.
   */
  public UnicodeBOMInputStream(final InputStream inputStream) throws NullPointerException,
                                                                     IOException
  {
    if (inputStream == null)
      throw new NullPointerException("invalid input stream: null is not allowed");

    in = new PushbackInputStream(inputStream, MAX_BOM_LENGTH);

    final byte[] head = new byte[MAX_BOM_LENGTH];

    // A single read() is allowed to return fewer bytes than requested even
    // when more data follows (sockets, pipes, decompressors, ...). Keep
    // reading until the probe buffer is full or the stream ends; otherwise a
    // short first read would make a present BOM go undetected.
    int filled = 0;
    while (filled < head.length)
    {
      final int count = in.read(head, filled, head.length - filled);
      if (count < 0)
        break; // end of stream: classify whatever was collected
      filled += count;
    }

    this.bom = detect(head, filled);

    // Hand the probed bytes back so that detection is non-destructive.
    if (filled > 0)
      in.unread(head, 0, filled);
  }

  /**
   * Classifies the first <code>length</code> bytes of <code>head</code>.
   *
   * The UTF-32 little-endian BOM (FF FE 00 00) begins with the UTF-16
   * little-endian BOM (FF FE), so the 4-byte candidates are tested before
   * the shorter ones.
   */
  private static BOM detect(final byte[] head, final int length)
  {
    final BOM[] candidates = {
      BOM.UTF_32_LE, BOM.UTF_32_BE, BOM.UTF_8, BOM.UTF_16_LE, BOM.UTF_16_BE
    };

    for (int i = 0; i < candidates.length; i++)
    {
      if (startsWith(head, length, candidates[i].bytes))
        return candidates[i];
    }
    return BOM.NONE;
  }

  /**
   * Tells whether the first <code>length</code> valid bytes of
   * <code>head</code> begin with <code>prefix</code>.
   */
  private static boolean startsWith(final byte[] head,
                                    final int length,
                                    final byte[] prefix)
  {
    if (length < prefix.length)
      return false;

    for (int i = 0; i < prefix.length; i++)
    {
      if (head[i] != prefix[i])
        return false;
    }
    return true;
  }

  /**
   * Returns the <code>BOM</code> that was detected in the wrapped
   * <code>InputStream</code> object.
   *
   * @return a <code>BOM</code> value; {@link BOM#NONE} when no BOM was found.
   */
  public final BOM getBOM()
  {
    // The field is final and BOM instances are immutable.
    return bom;
  }

  /**
   * Skips the <code>BOM</code> that was found in the wrapped
   * <code>InputStream</code> object. Only the first call skips anything;
   * later calls return immediately.
   *
   * <p>This method skips as many bytes as the detected BOM is long from the
   * current position, so it must be called before any byte has been read
   * from this stream.</p>
   *
   * @return this <code>UnicodeBOMInputStream</code>.
   *
   * @throws IOException when trying to skip the BOM from the wrapped
   * <code>InputStream</code> object.
   */
  public final synchronized UnicodeBOMInputStream skipBOM() throws IOException
  {
    if (!skipped)
    {
      // The BOM bytes were pushed back by the constructor, so they are the
      // next bytes this skip consumes (zero bytes for BOM.NONE).
      in.skip(bom.bytes.length);
      skipped = true;
    }
    return this;
  }

  /**
   * {@inheritDoc}
   */
  public int read() throws IOException
  {
    return in.read();
  }

  /**
   * {@inheritDoc}
   */
  public int read(final byte b[]) throws IOException,
                                         NullPointerException
  {
    return in.read(b, 0, b.length);
  }

  /**
   * {@inheritDoc}
   */
  public int read(final byte b[],
                  final int off,
                  final int len) throws IOException,
                                        NullPointerException
  {
    return in.read(b, off, len);
  }

  /**
   * {@inheritDoc}
   */
  public long skip(final long n) throws IOException
  {
    return in.skip(n);
  }

  /**
   * {@inheritDoc}
   */
  public int available() throws IOException
  {
    return in.available();
  }

  /**
   * Closes the internal push-back stream that wraps the stream passed to the
   * constructor.
   *
   * {@inheritDoc}
   */
  public void close() throws IOException
  {
    in.close();
  }

  /**
   * Delegates to the internal <code>PushbackInputStream</code>.
   * NOTE(review): the JDK documents <code>PushbackInputStream.mark</code> as
   * doing nothing, so callers should consult {@link #markSupported()}.
   *
   * {@inheritDoc}
   */
  public synchronized void mark(final int readlimit)
  {
    in.mark(readlimit);
  }

  /**
   * Delegates to the internal <code>PushbackInputStream</code>.
   *
   * {@inheritDoc}
   */
  public synchronized void reset() throws IOException
  {
    in.reset();
  }

  /**
   * Reports whatever the internal <code>PushbackInputStream</code> reports.
   *
   * {@inheritDoc}
   */
  public boolean markSupported()
  {
    return in.markSupported();
  }

  /** Wrapped stream, with enough push-back room for the longest BOM. */
  private final PushbackInputStream in;

  /** BOM detected by the constructor; never <code>null</code>. */
  private final BOM bom;

  /** Set once {@link #skipBOM()} has run, so the BOM is skipped only once. */
  private boolean skipped = false;

} // UnicodeBOMInputStream

可以按如下方式使用它:

import java.io.BufferedReader;

import java.io.FileInputStream;

import java.io.InputStreamReader;

/**
 * Small command-line demo for <code>UnicodeBOMInputStream</code>: reads the
 * first line of a sample file twice, once leaving the BOM in the data and
 * once after calling <code>skipBOM()</code>, and prints both results.
 */
public final class UnicodeBOMInputStreamUsage
{
  /** Sample file opened by both passes of the demo. */
  private static final String SAMPLE_FILE = "test/offending_bom.txt";

  /**
   * Runs the demo.
   *
   * @param args ignored.
   *
   * @throws Exception when the sample file cannot be opened or read.
   */
  public static void main(final String[] args) throws Exception
  {
    // Pass 1: report the detected BOM and read the first line with the BOM
    // still present in the stream.
    FileInputStream fis = new FileInputStream(SAMPLE_FILE);
    try
    {
      final UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(fis);

      System.out.println("detected BOM: " + ubis.getBOM());
      System.out.print("Reading the content of the file without skipping the BOM: ");

      // Decoding uses the platform default charset, as the reader is created
      // without an explicit one.
      final BufferedReader br = new BufferedReader(new InputStreamReader(ubis));
      System.out.println(br.readLine());
      br.close();
    }
    finally
    {
      // Release the file handle even if detection or readLine() throws.
      fis.close();
    }

    // Pass 2: same file, but drop the BOM before the reader sees any byte.
    fis = new FileInputStream(SAMPLE_FILE);
    try
    {
      final UnicodeBOMInputStream ubis = new UnicodeBOMInputStream(fis);
      final BufferedReader br = new BufferedReader(new InputStreamReader(ubis));

      // Must happen before the first read so that only the BOM is skipped.
      ubis.skipBOM();

      System.out.print("Reading the content of the file after skipping the BOM: ");
      System.out.println(br.readLine());
      br.close();
    }
    finally
    {
      fis.close();
    }
  }

} // UnicodeBOMInputStreamUsage

以上是 Java中字节顺序标记错误的文件读取 的全部内容, 来源链接: utcz.com/qa/404317.html

回到顶部