3. 数据连接封装
# 3.1 文档加载器:Document Loaders
pip install pymupdf
1
from langchain_community.document_loaders import PyMuPDFLoader
# Load the PDF through PyMuPDFLoader and split it; the result is indexed
# like a list whose items expose their text via `.page_content`.
pdf_path = "llama2.pdf"
loader = PyMuPDFLoader(pdf_path)
pages = loader.load_and_split()

# Print the text of the first item as a quick sanity check.
first_doc = pages[0]
print(first_doc.page_content)
1
2
3
4
5
6
2
3
4
5
6
# 3.2 文档处理器
TextSplitter(文本分割器)
pip install --upgrade langchain-text-splitters
1
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Load the PDF and split it, then show the text of the first item.
loader = PyMuPDFLoader("llama2.pdf")
pages = loader.load_and_split()
print(pages[0].page_content)

# Configure a recursive splitter: chunks of up to 200 units with an overlap
# of 100, where length is measured with the built-in len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)

# Re-split only the first item's text into smaller documents.
paragraphs = text_splitter.create_documents([pages[0].page_content])

# Print each chunk followed by a separator line.
# NOTE(review): the separator is assumed to belong inside the loop; the
# original indentation was lost -- confirm against the source page.
for para in paragraphs:
    print(para.page_content)
    print('-------')
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
与 LlamaIndex 类似,LangChain 也提供了丰富的 Document Loaders 和 Text Splitters。不过 LangChain 自带的文档处理器不太好用,这里不做详细介绍。
# 3.3 向量数据库与向量检索
from langchain_openai import OpenAIEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_openai import ChatOpenAI
from langchain_community.document_loaders import PyMuPDFLoader
# Load the document.
loader = PyMuPDFLoader("llama2.pdf")
pages = loader.load_and_split()

# Split the text: chunks of up to 300 units with an overlap of 100, where
# length is measured with the built-in len().
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=300,
    chunk_overlap=100,
    length_function=len,
    add_start_index=True,
)
# Only the first four loaded items are used for this demo.
texts = text_splitter.create_documents(
    [page.page_content for page in pages[:4]]
)

# Build the vector store: embed the chunks and index them with FAISS.
embeddings = OpenAIEmbeddings(model="text-embedding-3-small")
db = FAISS.from_documents(texts, embeddings)

# Retrieve the top-3 results for the query.
retriever = db.as_retriever(search_kwargs={"k": 3})
docs = retriever.invoke("llama2有多少参数")

# Print each retrieved chunk followed by a separator line.
# NOTE(review): the separator is assumed to belong inside the loop; the
# original indentation was lost -- confirm against the source page.
for doc in docs:
    print(doc.page_content)
    print("----")
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
注意:
- 文档处理部分,建议在实际应用中详细测试后使用
- 与向量数据库的连接部分本质是接口封装,向量数据库需要自己选型
编辑 (opens new window)
上次更新: 2025/12/19, 15:17:48