记:基于Springboot+Lucene构建简单的文件搜索系统
文件存储系统中存放了不同类型的文件,后台通过程序提取了文件名和内容,使用Lucene对文件名和文件内容进行索引,前端对用户提供查询接口,用户提交查询关键词之后检索索引库,返回匹配文档至前端页面。
按照上图进行简单的架构设计:准备了两份测试文档,使用开源工具Tika完成信息抽取,使用Lucene构建索引,使用HTML页面提供用户查询接口。核心代码如下,首先是构建索引和查询索引的服务端代码:
/** * 根据用户输入内容搜索
*
* @param search 输入搜索内容
* @return 结果集
*/
public List<FileModel> findByTerm(String search) throws IOException {
if(Strings.isNullOrEmpty(search)){
return Lists.newArrayList();
}
List<FileModel> hitList = new ArrayList<>();
String[] fields = {"title", "content"};
ClassPathResource cpr = new ClassPathResource("indexdir");
Path path = Paths.get(cpr.getFile().toURI());
Directory dir;
try {
dir = FSDirectory.open(path);
IndexReader reader = DirectoryReader.open(dir);
IndexSearcher searcher = new IndexSearcher(reader);
Analyzer analyzer = new IKAnalyzer6x(true);
MultiFieldQueryParser parser = new MultiFieldQueryParser(fields, analyzer);
Query query = parser.parse(search);
//定制高亮
SimpleHTMLFormatter formatter = new SimpleHTMLFormatter("<span style="color:red;">", "</span>");
//标题高亮
QueryScorer scorerTitle = new QueryScorer(query, fields[0]);
Highlighter highlightTitle = new Highlighter(formatter, scorerTitle);
//内容高亮
QueryScorer scorerContent = new QueryScorer(query, fields[1]);
Highlighter highlightContent = new Highlighter(formatter, scorerContent);
TopDocs topDocs = searcher.search(query, 10);
for (ScoreDoc sd : topDocs.scoreDocs) {
Document doc = searcher.doc(sd.doc);
String title = doc.get("title");
String content = doc.get("content");
TokenStream tokenStream = TokenSources.getAnyTokenStream(searcher.getIndexReader(), sd.doc, fields[0], analyzer);
Fragmenter fragmenter = new SimpleSpanFragmenter(scorerTitle);
highlightTitle.setTextFragmenter(fragmenter);
String hlTitle = highlightTitle.getBestFragment(tokenStream, title);
//获取内容高亮片段
tokenStream = TokenSources.getTokenStream(searcher.getIndexReader(), sd.doc, fields[1], analyzer);
fragmenter = new SimpleSpanFragmenter(scorerContent);
highlightContent.setTextFragmenter(fragmenter);
String hlContent = highlightContent.getBestFragment(tokenStream, content);
FileModel fm = new FileModel(hlTitle != null ? hlTitle : title, hlContent != null ? hlContent : content);
hitList.add(fm);
}
} catch (Exception e) {
e.printStackTrace();
}
return hitList;
}
/**
 * Builds one {@code FileModel} per regular file found directly under the {@code doc}
 * classpath directory, using the file name as title and the text returned by
 * {@link #parseExtraction(File)} as content.
 *
 * @return the extracted files; empty when the {@code doc} resource does not exist or
 *         cannot be listed as a directory
 * @throws Exception if the {@code doc} resource exists but cannot be resolved to a file
 */
public List<FileModel> extractFile() throws Exception {
    List<FileModel> list = new ArrayList<>();
    ClassPathResource cpr = new ClassPathResource("doc");
    // Check the resource itself first: getFile() throws for an unresolvable resource,
    // so testing getFile().exists() would never reach the "missing" branch.
    if (!cpr.exists()) {
        return list;
    }
    // listFiles() returns null when the path is not a directory or cannot be read.
    File[] files = cpr.getFile().listFiles();
    if (files == null) {
        return list;
    }
    for (File file : files) {
        // Sub-directories cannot be opened as a stream; skip them instead of indexing empty content.
        if (!file.isFile()) {
            continue;
        }
        list.add(new FileModel(file.getName(), parseExtraction(file)));
    }
    return list;
}
/**
 * Extracts the body text of a file with Tika's auto-detecting parser.
 *
 * <p>The content handler is created with a limit of 10 * 1024 * 1024 characters. Any
 * failure while opening or parsing the file is printed and an empty string is returned.
 *
 * @param file the file to extract text from
 * @return the extracted text, or {@code ""} when extraction fails
 */
public String parseExtraction(File file) {
    String content = "";
    BodyContentHandler handler = new BodyContentHandler(10 * 1024 * 1024);
    Parser parser = new AutoDetectParser();
    Metadata metadata = new Metadata();
    ParseContext context = new ParseContext();
    // try-with-resources guarantees the stream is closed whether or not parsing succeeds.
    try (FileInputStream inputStream = new FileInputStream(file)) {
        parser.parse(inputStream, handler, metadata, context);
        content = handler.toString();
    } catch (Exception e) {
        // Deliberate best effort: report the failure and fall back to empty content.
        e.printStackTrace();
    }
    return content;
}
/**
 * Rebuilds the Lucene index in the {@code indexdir} classpath directory from the files
 * returned by {@link #extractFile()}.
 *
 * <p>The writer is opened in {@code CREATE} mode, so any existing index is replaced. Each
 * file becomes one document with a {@code title} and a {@code content} field; both are
 * stored, tokenized and indexed with positions, offsets and term vectors. The elapsed time
 * is printed to standard output when indexing completes.
 *
 * @throws Exception if the index directory cannot be opened, or extraction or indexing fails
 */
public void createIndex() throws Exception {
    // Analyzer used to tokenize the indexed fields.
    Analyzer analyzer = new IKAnalyzer6x();
    IndexWriterConfig icw = new IndexWriterConfig(analyzer);
    icw.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    Resource resource = new ClassPathResource("indexdir");

    // Shared field type for both fields.
    FieldType fieldType = new FieldType();
    fieldType.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    fieldType.setStored(true);
    fieldType.setTokenized(true);
    fieldType.setStoreTermVectors(true);
    fieldType.setStoreTermVectorPositions(true);
    fieldType.setStoreTermVectorOffsets(true);

    long startMillis = System.currentTimeMillis();
    // Closing the writer and directory in all cases releases the index write lock even
    // when extraction or indexing throws.
    try (Directory dir = FSDirectory.open(resource.getFile().toPath());
         IndexWriter indexWriter = new IndexWriter(dir, icw)) {
        for (FileModel fm : extractFile()) {
            Document doc = new Document();
            doc.add(new Field("title", fm.getTitle(), fieldType));
            doc.add(new Field("content", fm.getContent(), fieldType));
            indexWriter.addDocument(doc);
        }
    }
    long elapsedMillis = System.currentTimeMillis() - startMillis;
    System.out.println("索引文档完成,共耗时:" + elapsedMillis + "毫秒");
}
<!DOCTYPE html>
<!-- Search page: a Thymeleaf template whose #app element is driven by Vue (see /js/index.js). -->
<html lang="zh-CN" xmlns:th="http://www.thymeleaf.org" xmlns:v-on="http://www.w3.org/1999/xhtml">
<head>
<meta charset="UTF-8">
<title>首页</title>
<link th:href="@{/css/index.css}" rel="stylesheet" type="text/css">
<script th:src="@{/js/vue.min.js}"></script>
<script th:src="@{/js/axios.min.js}"></script>
</head>
<body>
<div id="app">
<!-- Search box: the input is exposed to the Vue instance through ref="input_content". -->
<div id="search" class="search-input">
<div class="si"><input ref="input_content" class="text-input" type="text"/></div>
<div class="sb">
<!-- Icon-only button, so it carries an aria-label as its accessible name. -->
<button class="btn-search" type="button" aria-label="搜索" v-on:click="search">
<svg t="1587265580322" class="icon" viewBox="0 0 1024 1024" version="1.1"
xmlns="http://www.w3.org/2000/svg" p-id="1914" width="28" height="28">
<path d="M712.5 645.3l161.2 161.2c26.3 26.3 32.7 63.1 14.2 81.6-18.5 18.5-55.3 12.2-81.6-14.2L645.1 712.7"
fill="#FFEABB" p-id="1915"></path>
<path
d="M859.7 928.6c-4.9 0-9.8-0.4-14.9-1.3-21.5-3.7-42.7-15.2-59.8-32.3L623.9 733.9l42.4-42.4 161.2 161.2c16.3 16.3 34.9 18.5 39.2 14.2 4.3-4.3 2.1-22.9-14.2-39.2L691.3 666.5l42.4-42.4 161.2 161.2c17.1 17.1 28.5 38.3 32.3 59.8 4.3 25-2.2 48.4-18.1 64.2-12.7 12.6-30.1 19.3-49.4 19.3z"
fill="#F9C73D" p-id="1916"></path>
<path
d="M443.3 806.5c-49 0-96.6-9.6-141.4-28.6-43.3-18.3-82.1-44.5-115.4-77.8-33.3-33.3-59.5-72.2-77.8-115.4C89.6 539.8 80 492.3 80 443.3s9.6-96.6 28.6-141.4c18.3-43.3 44.5-82.1 77.8-115.4s72.2-59.5 115.4-77.8C346.7 89.6 394.2 80 443.3 80s96.6 9.6 141.4 28.6c43.3 18.3 82.1 44.5 115.4 77.8 33.3 33.3 59.5 72.2 77.8 115.4 18.9 44.8 28.6 92.4 28.6 141.4s-9.6 96.6-28.6 141.4c-18.3 43.3-44.5 82.1-77.8 115.4-33.3 33.3-72.2 59.5-115.4 77.8-44.9 19.1-92.4 28.7-141.4 28.7z m0-666.5C276 140 140 276 140 443.3c0 167.2 136 303.2 303.2 303.2s303.2-136 303.2-303.2C746.5 276 610.5 140 443.3 140z"
fill="#F9C73D" p-id="1917"></path>
</svg>
</button>
</div>
</div>
<!-- Result list: title and content are rendered with v-html so the highlight <span> markup
     returned by the server is displayed. v-html replaces the element's children, so no
     mustache placeholder is needed inside the <p> tags. Items have no id, so the list
     index is used as the key. -->
<div class="result-list">
<ul>
<li v-for="(item, index) in resultList" v-bind:key="index">
<p class="p_title" v-html="item.title"></p>
<p class="p_content" v-html="item.content"></p>
</li>
</ul>
</div>
</div>
<script type="text/javascript" th:src="@{/js/index.js}"></script>
</body>
</html>
查询结果示例图如下:
本次学习主要是把之前的知识串起来:Lucene索引构建、索引查询、搜索词解析、多域查询、搜索词高亮,以及好久没写的Vue.js和HTML+CSS知识。
以上是 记:基于Springboot+Lucene构建简单的文件搜索系统 的全部内容, 来源链接: utcz.com/z/515628.html