如何执行文本文件的二进制搜索

我有一个很大的文本文件(5Mb),我在我的Android应用程序中使用。我将该文件创建为预先分类的字符串列表,并且该文件一旦创建就不会更改。如何对这个文件的内容执行二分搜索,而不需要逐行读取匹配的字符串?如何执行文本文件的二进制搜索

回答:

由于文件内容没有变化,所以可以将文件分成多个部分。说A-G,H-N,0-T和U-Z。这使您可以检查第一个字符,并立即将可能的设置切割为原始尺寸的四分之一。现在线性搜索不会花费太长时间,或者读取整个文件可能是一个选项。如果n/4仍然过大,这个过程可能会延长,但这个想法是相同的。将搜索细分构建到文件结构中,而不是试图在内存中完成所有操作。

回答:

5MB文件并不是那么大 - 您应该可以将每行读入String[]数组,然后您可以使用java.util.Arrays.binarySearch()找到所需的行。这是我推荐的方法。

如果您不想将整个文件读入应用程序,那么它会变得更加复杂。如果该文件的每一行是相同的长度,并且该文件已经排序,那么你就可以打开但在RandomAccessFile中的文件,并使用seek()这样执行二进制搜索自己......

// open the file for reading 

RandomAccessFile raf = new RandomAccessFile("myfile.txt","r");

String searchValue = "myline";

int lineSize = 50;

int numberOfLines = raf.length()/lineSize;

// perform the binary search...

byte[] lineBuffer = new byte[lineSize];

int bottom = 0;

int top = numberOfLines;

int middle;

while (bottom <= top){

middle = (bottom+top)/2;

raf.seek(middle*lineSize); // jump to this line in the file

raf.read(lineBuffer); // read the line from the file

String line = new String(lineBuffer); // convert the line to a String

int comparison = line.compareTo(searchValue);

if (comparison == 0){

// found it

break;

}

else if (comparison < 0){

// line comes before searchValue

bottom = middle + 1;

}

else {

// line comes after searchValue

top = middle - 1;

}

}

raf.close(); // close the file when you're finished

,如果该文件没有固定宽度的行,那么您不能轻松地执行二进制搜索,而无需首先将其加载到内存中,因为您无法像使用固定宽度的行一样快速跳转到文件中的特定行。

回答:

在一个统一的字符长度的文本文件中,你可以寻找字符间距的中间字符,开始阅读字符,直到你打开你的分隔符,然后使用后续字符串作为元素明智中间的近似值。但是,在android中这样做的问题显然不能get random access to a resource(尽管我想你可以每次重新打开它)。此外,这种技术不会推广到其他类型的地图和集合。

另一种选择是(使用RandomAccessFile)在文件的开始处编写一个ints的“数组” - 每个String一个 - 然后返回并用它们相应的字符串的位置更新它们。再次搜索将需要跳来跳去。

我会做什么(并在我自己的应用程序中做过)是在文件中实现hash set。这是一个单独的链接与树木。

import java.io.BufferedInputStream; 

import java.io.DataInputStream;

import java.io.File;

import java.io.FileInputStream;

import java.io.IOException;

import java.io.RandomAccessFile;

import java.util.ArrayList;

import java.util.Collections;

import java.util.LinkedList;

import java.util.Set;

class StringFileSet {

private static final double loadFactor = 0.75;

public static void makeFile(String fileName, String comment, Set<String> set) throws IOException {

new File(fileName).delete();

RandomAccessFile fout = new RandomAccessFile(fileName, "rw");

//Write comment

fout.writeUTF(comment);

//Make bucket array

int numBuckets = (int)(set.size()/loadFactor);

ArrayList<ArrayList<String>> bucketArray = new ArrayList<ArrayList<String>>(numBuckets);

for (int ii = 0; ii < numBuckets; ii++){

bucketArray.add(new ArrayList<String>());

}

for (String key : set){

bucketArray.get(Math.abs(key.hashCode()%numBuckets)).add(key);

}

//Sort key lists in preparation for creating trees

for (ArrayList<String> keyList : bucketArray){

Collections.sort(keyList);

}

//Make queues in preparation for creating trees

class NodeInfo{

public final int lower;

public final int upper;

public final long callingOffset;

public NodeInfo(int lower, int upper, long callingOffset){

this.lower = lower;

this.upper = upper;

this.callingOffset = callingOffset;

}

}

ArrayList<LinkedList<NodeInfo>> queueList = new ArrayList<LinkedList<NodeInfo>>(numBuckets);

for (int ii = 0; ii < numBuckets; ii++){

queueList.add(new LinkedList<NodeInfo>());

}

//Write bucket array

fout.writeInt(numBuckets);

for (int index = 0; index < numBuckets; index++){

queueList.get(index).add(new NodeInfo(0, bucketArray.get(index).size()-1, fout.getFilePointer()));

fout.writeInt(-1);

}

//Write trees

for (int bucketIndex = 0; bucketIndex < numBuckets; bucketIndex++){

while (queueList.get(bucketIndex).size() != 0){

NodeInfo nodeInfo = queueList.get(bucketIndex).poll();

if (nodeInfo.lower <= nodeInfo.upper){

//Set respective pointer in parent node

fout.seek(nodeInfo.callingOffset);

fout.writeInt((int)(fout.length() - (nodeInfo.callingOffset + 4))); //Distance instead of absolute position so that the get method can use a DataInputStream

fout.seek(fout.length());

int middle = (nodeInfo.lower + nodeInfo.upper)/2;

//Key

fout.writeUTF(bucketArray.get(bucketIndex).get(middle));

//Left child

queueList.get(bucketIndex).add(new NodeInfo(nodeInfo.lower, middle-1, fout.getFilePointer()));

fout.writeInt(-1);

//Right child

queueList.get(bucketIndex).add(new NodeInfo(middle+1, nodeInfo.upper, fout.getFilePointer()));

fout.writeInt(-1);

}

}

}

fout.close();

}

private final String fileName;

private final int numBuckets;

private final int bucketArrayOffset;

public StringFileSet(String fileName) throws IOException {

this.fileName = fileName;

DataInputStream fin = new DataInputStream(new BufferedInputStream(new FileInputStream(fileName)));

short numBytes = fin.readShort();

fin.skipBytes(numBytes);

this.numBuckets = fin.readInt();

this.bucketArrayOffset = numBytes + 6;

fin.close();

}

public boolean contains(String key) throws IOException {

boolean containsKey = false;

DataInputStream fin = new DataInputStream(new BufferedInputStream(new FileInputStream(this.fileName)));

fin.skipBytes(4*(Math.abs(key.hashCode()%this.numBuckets)) + this.bucketArrayOffset);

int distance = fin.readInt();

while (distance != -1){

fin.skipBytes(distance);

String candidate = fin.readUTF();

if (key.compareTo(candidate) < 0){

distance = fin.readInt();

}else if (key.compareTo(candidate) > 0){

fin.skipBytes(4);

distance = fin.readInt();

}else{

fin.skipBytes(8);

containsKey = true;

break;

}

}

fin.close();

return containsKey;

}

}

测试程序

import java.io.File; 

import java.io.IOException;

import java.util.HashSet;

class Test {

public static void main(String[] args) throws IOException {

HashSet<String> stringMemorySet = new HashSet<String>();

stringMemorySet.add("red");

stringMemorySet.add("yellow");

stringMemorySet.add("blue");

StringFileSet.makeFile("stringSet", "Provided under ... included in all copies and derivatives ...", stringMemorySet);

StringFileSet stringFileSet = new StringFileSet("stringSet");

System.out.println("orange -> " + stringFileSet.contains("orange"));

System.out.println("red -> " + stringFileSet.contains("red"));

System.out.println("yellow -> " + stringFileSet.contains("yellow"));

System.out.println("blue -> " + stringFileSet.contains("blue"));

new File("stringSet").delete();

System.out.println();

}

}

您还需要pass a Context它,如果当你修改其Android系统,因此它可以访问getResources()方法。

你也可能会想要stop the android build tools from compressing the file,这显然只能做到 - 如果你正在使用GUI - 通过将文件的扩展名更改为像jpg这样的东西。这使得我的应用程序的处理速度提高了大约100到300倍。

您也可以使用NDK来查看giving yourself more memory。

回答:

这里是我快速放在一起的东西。它使用两个文件,一个包含文字,另一个包含偏移量。偏移文件的格式是这样的:第一10位包含单词大小,最后22位包含该偏移量(字位置,例如,AAAH将是0,abasementable将是4,等等)。它以大端(java标准)编码。希望它有助于某人。

word.dat:

aaahabasementableabnormalabnormalityabortionistabortion-rightsabracadabra

wordx.dat:

00 80 00 00 01 20 00 04 00 80 00 0D 01 00 00 11 _____ __________ 

01 60 00 19 01 60 00 24 01 E0 00 2F 01 60 00 3E _`___`_$___/_`_>

我在C#中创建这些文件,但这里是它的代码(它使用一个txt文件与单词由crlfs分隔)

static void Main(string[] args) 

{

const string fIn = @"C:\projects\droid\WriteFiles\input\allwords.txt";

const string fwordxOut = @"C:\projects\droid\WriteFiles\output\wordx.dat";

const string fWordOut = @"C:\projects\droid\WriteFiles\output\word.dat";

int i = 0;

int offset = 0;

int j = 0;

var lines = File.ReadLines(fIn);

FileStream stream = new FileStream(fwordxOut, FileMode.Create, FileAccess.ReadWrite);

using (EndianBinaryWriter wwordxOut = new EndianBinaryWriter(EndianBitConverter.Big, stream))

{

using (StreamWriter wWordOut = new StreamWriter(File.Open(fWordOut, FileMode.Create)))

{

foreach (var line in lines)

{

wWordOut.Write(line);

i = offset | ((int)line.Length << 22); //first 10 bits to the left is the word size

offset = offset + (int)line.Length;

wwordxOut.Write(i);

//if (j == 7)

// break;

j++;

}

}

}

}

这是二进制文件搜索Java代码:

public static void binarySearch() { 

String TAG = "TEST";

String wordFilePath = Environment.getExternalStorageDirectory().getAbsolutePath() + "/word.dat";

String wordxFilePath = Environment.getExternalStorageDirectory().getAbsolutePath() + "/wordx.dat";

String target = "abracadabra";

boolean targetFound = false;

int searchCount = 0;

try {

RandomAccessFile raf = new RandomAccessFile(wordxFilePath, "r");

RandomAccessFile rafWord = new RandomAccessFile(wordFilePath, "r");

long low = 0;

long high = (raf.length()/4) - 1;

int cur = 0;

long wordOffset = 0;

int len = 0;

while (high >= low) {

long mid = (low + high)/2;

raf.seek(mid * 4);

cur = raf.readInt();

Log.v(TAG + "-cur", String.valueOf(cur));

len = cur >> 22; //word length

cur = cur & 0x3FFFFF; //first 10 bits are 0

rafWord.seek(cur);

byte [] bytes = new byte[len];

wordOffset = rafWord.read(bytes, 0, len);

Log.v(TAG + "-wordOffset", String.valueOf(wordOffset));

searchCount++;

String str = new String(bytes);

Log.v(TAG, str);

if (target.compareTo(str) < 0) {

high = mid - 1;

} else if (target.compareTo(str) == 0) {

targetFound = true;

break;

} else {

low = mid + 1;

}

}

raf.close();

rafWord.close();

} catch (FileNotFoundException e) {

e.printStackTrace();

} catch (IOException e) {

e.printStackTrace();

}

if (targetFound == true) {

Log.v(TAG + "-found " , String.valueOf(searchCount));

} else {

Log.v(TAG + "-not found " , String.valueOf(searchCount));

}

}

回答:

虽然这听起来有点小题大做,不存储你需要作为平面文件来做到这一点的数据。建立数据库并查询数据库中的数据。这应该既有效又快速。

以上是 如何执行文本文件的二进制搜索 的全部内容, 来源链接: utcz.com/qa/257636.html

回到顶部