记:基于Springboot+Lucene构建简单的文件搜索系统

编程

文件存储系统中存放了不同类型的文件,后台通过程序提取了文件名和内容,使用Lucene对文件名和文件内容进行索引,前端对用户提供查询接口,用户提交查询关键词之后检索索引库,返回匹配文档至前端页面。

按照上图进行简单的架构设计:准备两份测试文档,使用开源工具 Tika 完成信息抽取,使用 Lucene 构建索引,使用 HTML 页面提供用户查询接口。核心代码如下,首先是构建索引和查询索引的服务端代码,随后是前端查询页面。

/**

* 根据用户输入内容搜索

*

* @param search 输入搜索内容

* @return 结果集

*/

public List<FileModel> findByTerm(String search) throws IOException {

if(Strings.isNullOrEmpty(search)){

return Lists.newArrayList();

}

List<FileModel> hitList = new ArrayList<>();

String[] fields = {"title", "content"};

ClassPathResource cpr = new ClassPathResource("indexdir");

Path path = Paths.get(cpr.getFile().toURI());

Directory dir;

try {

dir = FSDirectory.open(path);

IndexReader reader = DirectoryReader.open(dir);

IndexSearcher searcher = new IndexSearcher(reader);

Analyzer analyzer = new IKAnalyzer6x(true);

MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);

Query query = parser.parse(search);

//定制高亮

SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span style="color:red;">", "</span>");

//标题高亮

QueryScorer scorerTitle = new QueryScorer(query, fields[0]);

Highlighter highlightTitle = new Highlighter(formatter, scorerTitle);

//内容高亮

QueryScorer scorerContent = new QueryScorer(query, fields[1]);

Highlighter highlightContent = new Highlighter(formatter, scorerContent);

TopDocs topDocs = searcher.search(query, 10);

for (ScoreDoc sd : topDocs.scoreDocs) {

Document doc = searcher.doc(sd.doc);

String title = doc.get("title");

String content = doc.get("content");

TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), sd.doc, fields[0], analyzer);

Fragmenter fragmenter = new SimpleSpanFragmenter(scorerTitle);

highlightTitle.setTextFragmenter(fragmenter);

String hlTitle = highlightTitle.getBestFragment(tokenStream, title);

//获取内容高亮片段

tokenStream = TokenSources.getTokenStream(searcher.getIndexReader(), sd.doc, fields[1], analyzer);

fragmenter = new SimpleSpanFragmenter(scorerContent);

highlightContent.setTextFragmenter(fragmenter);

String hlContent = highlightContent.getBestFragment(tokenStream, content);

FileModel fm = new FileModel(hlTitle != null ? hlTitle : title, hlContent != null ? hlContent : content);

hitList.add(fm);

}

} catch (Exception e) {

e.printStackTrace();

}

return hitList;

}

/**
 * Builds a {@link FileModel} for every regular file directly under the {@code doc}
 * classpath directory, using the file name as title and the extracted text as content.
 *
 * @return one model per file; an empty list when the {@code doc} resource does not exist
 *         or is not a listable directory
 * @throws Exception if the resource exists but cannot be resolved to a file on disk
 */
public List<FileModel> extractFile() throws Exception {
    List<FileModel> list = new ArrayList<>();
    ClassPathResource cpr = new ClassPathResource("doc");
    // Ask the resource itself: getFile() throws for a missing classpath resource, so testing
    // getFile().exists() could never reach the "return empty list" branch.
    if (!cpr.exists()) {
        return list;
    }
    // listFiles() returns null when the path is not a directory or cannot be read.
    File[] files = cpr.getFile().listFiles();
    if (files == null) {
        return list;
    }
    for (File file : files) {
        // Sub-directories cannot be opened as a stream; skip them rather than index empty text.
        if (!file.isFile()) {
            continue;
        }
        list.add(new FileModel(file.getName(), parseExtraction(file)));
    }
    return list;
}

/**
 * Extracts the textual body of a file with Tika's auto-detecting parser.
 *
 * @param file the file to read
 * @return the extracted text, or an empty string when the file cannot be opened or parsed
 */
public String parseExtraction(File file) {
    // Handler constructed with a write limit of 10 * 1024 * 1024.
    BodyContentHandler handler = new BodyContentHandler(10 * 1024 * 1024);
    Parser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();

    // try-with-resources closes the stream even when parsing throws; the original left it open.
    try (FileInputStream inputStream = new FileInputStream(file)) {
        parser.parse(inputStream, handler, metadata, context);
        return handler.toString();
    } catch (Exception e) {
        // Best effort: an unreadable file contributes empty content instead of aborting the caller.
        e.printStackTrace();
        return "";
    }
}

/**
 * Rebuilds the Lucene index in the {@code indexdir} classpath directory from the files
 * returned by {@link #extractFile()}.
 *
 * <p>The index is opened in {@code CREATE} mode, so any existing index is replaced. Both
 * {@code title} and {@code content} are stored, tokenized, and indexed with positions,
 * offsets and term vectors. The elapsed time is printed to standard output.
 *
 * @throws Exception if extraction fails, the index directory cannot be resolved, or writing fails
 */
public void createIndex() throws Exception {
    // Field configuration shared by title and content.
    FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldType.setStored(true);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);
    fieldType.setStoreTermVectorOffsets(true);

    Resource resource = new ClassPathResource("indexdir");
    long startNanos = System.nanoTime();

    // Extract before opening the writer: in CREATE mode the writer replaces the existing
    // index, so a failed extraction must not happen while the writer is already open.
    List<FileModel> fileModelList = extractFile();

    // try-with-resources closes writer, directory and analyzer (in that order) even when
    // indexing throws, so the index write lock is always released.
    try (Analyzer analyzer = new IKAnalyzer6x();
         Directory dir = FSDirectory.open(resource.getFile().toPath());
         IndexWriter indexWriter = new IndexWriter(
                 dir,
                 new IndexWriterConfig(analyzer).setOpenMode(IndexWriterConfig.OpenMode.CREATE))) {
        for (FileModel fm : fileModelList) {
            Document doc = new Document();
            doc.add(new Field("title", fm.getTitle(), fieldType));
            doc.add(new Field("content", fm.getContent(), fieldType));
            indexWriter.addDocument(doc);
        }
    }

    long elapsedMillis = (System.nanoTime() - startNanos) / 1_000_000L;
    System.out.println("索引文档完成,共耗时:" + elapsedMillis + "毫秒");
}

<!DOCTYPE html>
<html lang="en" xmlns:th="http://www.thymeleaf.org" xmlns:v-on="http://www.w3.org/1999/xhtml">
<head>
    <meta charset="UTF-8">
    <title>首页</title>
    <link th:href="@{/css/index.css}" rel="stylesheet" type="text/css">
    <script th:src="@{/js/vue.min.js}"></script>
    <script th:src="@{/js/axios.min.js}"></script>
</head>
<body>
<!-- Vue root element: search box on top, result list below. -->
<div id="app">
    <div id="search" class="search-input">
        <!-- The input is exposed to the script through the "input_content" ref. -->
        <div class="si"><input ref="input_content" class="text-input" type="text"/></div>
        <div class="sb">
            <!-- Icon-only button, so it carries an explicit accessible name. -->
            <button class="btn-search" type="button" aria-label="搜索" v-on:click="search">
                <svg t="1587265580322" class="icon" viewBox="0 0 1024 1024" version="1.1"
                     xmlns="http://www.w3.org/2000/svg" p-id="1914" width="28" height="28">
                    <path d="M712.5 645.3l161.2 161.2c26.3 26.3 32.7 63.1 14.2 81.6-18.5 18.5-55.3 12.2-81.6-14.2L645.1 712.7"
                          fill="#FFEABB" p-id="1915"></path>
                    <path
                            d="M859.7 928.6c-4.9 0-9.8-0.4-14.9-1.3-21.5-3.7-42.7-15.2-59.8-32.3L623.9 733.9l42.4-42.4 161.2 161.2c16.3 16.3 34.9 18.5 39.2 14.2 4.3-4.3 2.1-22.9-14.2-39.2L691.3 666.5l42.4-42.4 161.2 161.2c17.1 17.1 28.5 38.3 32.3 59.8 4.3 25-2.2 48.4-18.1 64.2-12.7 12.6-30.1 19.3-49.4 19.3z"
                            fill="#F9C73D" p-id="1916"></path>
                    <path
                            d="M443.3 806.5c-49 0-96.6-9.6-141.4-28.6-43.3-18.3-82.1-44.5-115.4-77.8-33.3-33.3-59.5-72.2-77.8-115.4C89.6 539.8 80 492.3 80 443.3s9.6-96.6 28.6-141.4c18.3-43.3 44.5-82.1 77.8-115.4s72.2-59.5 115.4-77.8C346.7 89.6 394.2 80 443.3 80s96.6 9.6 141.4 28.6c43.3 18.3 82.1 44.5 115.4 77.8 33.3 33.3 59.5 72.2 77.8 115.4 18.9 44.8 28.6 92.4 28.6 141.4s-9.6 96.6-28.6 141.4c-18.3 43.3-44.5 82.1-77.8 115.4-33.3 33.3-72.2 59.5-115.4 77.8-44.9 19.1-92.4 28.7-141.4 28.7z m0-666.5C276 140 140 276 140 443.3c0 167.2 136 303.2 303.2 303.2s303.2-136 303.2-303.2C746.5 276 610.5 140 443.3 140z"
                            fill="#F9C73D" p-id="1917"></path>
                </svg>
            </button>
        </div>
    </div>
    <div class="result-list">
        <ul>
            <!-- v-html replaces the element content, so no inner interpolation is needed.
                 The server-side highlight markup is rendered as raw HTML here. -->
            <li v-for="(item, index) in resultList" v-bind:key="index">
                <p class="p_title" v-html="item.title"></p>
                <p class="p_content" v-html="item.content"></p>
            </li>
        </ul>
    </div>
</div>
<script type="text/javascript" th:src="@{/js/index.js}"></script>
</body>
</html>

查询结果示例图如下:

本次学习主要是把之前的知识串起来:Lucene 索引构建、索引查询、搜索词解析、多域查询、搜索词高亮,以及好久没写的 Vue.js 和 HTML+CSS 知识。

以上是 记:基于Springboot+Lucene构建简单的文件搜索系统 的全部内容, 来源链接: utcz.com/z/515628.html

回到顶部