{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ad9ec63a",
   "metadata": {},
   "source": [
    "# Build and query a Chroma vector store\n",
    "\n",
    "This notebook loads a set of UTF-8 text files, splits them into overlapping chunks, embeds the chunks with a sentence-transformers model, stores them in a local Chroma directory, and finally runs a sample similarity query against the persisted store.\n",
    "\n",
    "Run it with **Restart Kernel → Run All**: every cell relies on names defined in the cells above it (the query cell in particular needs `CHROMA_PATH` and `embeddings` from the configuration cell)."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d3b19540",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import shutil\n",
    "\n",
    "# NOTE: `langchain.embeddings.HuggingFaceEmbeddings` is deprecated since LangChain 0.2.2;\n",
    "# the maintained class lives in the separate `langchain-huggingface` package.\n",
    "# The import is kept as-is so the notebook does not gain a new dependency.\n",
    "from langchain.document_loaders import TextLoader\n",
    "from langchain.embeddings import HuggingFaceEmbeddings\n",
    "from langchain.schema import Document\n",
    "from langchain.text_splitter import RecursiveCharacterTextSplitter\n",
    "from langchain.vectorstores import Chroma"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b7e41c02",
   "metadata": {},
   "source": [
    "## Configuration\n",
    "\n",
    "Everything a reader might want to tune lives here. The Chroma directory name is derived from the chunking settings and the embedding model, so stores built with different settings never share a directory."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8b80cda9",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Input files; each one is read as UTF-8.\n",
    "SOURCE_PATHS = (\"./term.txt\", \"./corpus.txt\", \"./law_explanation.txt\")\n",
    "\n",
    "# Text-splitter settings (sizes are measured with `len`, i.e. in characters).\n",
    "CHUNK_SIZE = 1000\n",
    "CHUNK_OVERLAP = 50\n",
    "\n",
    "# Index of the chunk printed as a sanity check after splitting.\n",
    "PREVIEW_CHUNK_INDEX = 10\n",
    "\n",
    "# Number of hits requested from the similarity search.\n",
    "TOP_K = 10\n",
    "\n",
    "# Alternative model that was tried: \"sentence-transformers/all-mpnet-base-v2\"\n",
    "EMBEDDING_MODEL_NAME = \"sentence-transformers/all-MiniLM-L6-v2\"\n",
    "MODEL_SHORT_NAME = EMBEDDING_MODEL_NAME.rsplit(\"/\", 1)[-1]\n",
    "\n",
    "# With the values above this evaluates to \"chroma_1000_50_all-MiniLM-L6-v2\".\n",
    "CHROMA_PATH = f\"chroma_{CHUNK_SIZE}_{CHUNK_OVERLAP}_{MODEL_SHORT_NAME}\"\n",
    "\n",
    "embeddings = HuggingFaceEmbeddings(model_name=EMBEDDING_MODEL_NAME)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c93d5a18",
   "metadata": {},
   "source": [
    "## Load and split the documents"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dac8394b",
   "metadata": {},
   "outputs": [],
   "source": [
    "def load_documents(paths=SOURCE_PATHS) -> list[Document]:\n",
    "    \"\"\"Load every file in ``paths`` with a UTF-8 ``TextLoader``.\n",
    "\n",
    "    Args:\n",
    "        paths: Iterable of text-file paths. Defaults to ``SOURCE_PATHS``.\n",
    "\n",
    "    Returns:\n",
    "        All documents produced by the loaders, in the order of ``paths``.\n",
    "    \"\"\"\n",
    "    docs = []\n",
    "    for path in paths:\n",
    "        docs.extend(TextLoader(path, encoding=\"utf-8\").load())\n",
    "    return docs\n",
    "\n",
    "\n",
    "def split_text(\n",
    "    documents: list[Document],\n",
    "    chunk_size: int = CHUNK_SIZE,\n",
    "    chunk_overlap: int = CHUNK_OVERLAP,\n",
    "    preview_index: int = PREVIEW_CHUNK_INDEX,\n",
    ") -> list[Document]:\n",
    "    \"\"\"Split ``documents`` into overlapping chunks and print a short report.\n",
    "\n",
    "    Args:\n",
    "        documents: Documents to split.\n",
    "        chunk_size: Maximum chunk length, measured with ``len``.\n",
    "        chunk_overlap: Overlap between consecutive chunks, measured with ``len``.\n",
    "        preview_index: Index of one chunk whose content and metadata are\n",
    "            printed as a sanity check. The preview is skipped when the index\n",
    "            is out of range instead of raising ``IndexError``.\n",
    "\n",
    "    Returns:\n",
    "        The chunks; ``add_start_index=True`` asks the splitter to record each\n",
    "        chunk's start offset in its metadata.\n",
    "    \"\"\"\n",
    "    text_splitter = RecursiveCharacterTextSplitter(\n",
    "        chunk_size=chunk_size,\n",
    "        chunk_overlap=chunk_overlap,\n",
    "        length_function=len,\n",
    "        add_start_index=True,\n",
    "    )\n",
    "    chunks = text_splitter.split_documents(documents)\n",
    "    print(f\"Split {len(documents)} documents into {len(chunks)} chunks.\")\n",
    "\n",
    "    # Small inputs may yield fewer chunks than `preview_index + 1`.\n",
    "    if 0 <= preview_index < len(chunks):\n",
    "        sample = chunks[preview_index]\n",
    "        print(sample.page_content)\n",
    "        print(sample.metadata)\n",
    "\n",
    "    return chunks"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "4f6a9d21",
   "metadata": {},
   "outputs": [],
   "source": [
    "documents = load_documents()\n",
    "chunks = split_text(documents)\n",
    "assert chunks, \"No chunks were produced; check the input files.\"\n",
    "\n",
    "# Remove any store left over from a previous run so the index is rebuilt from scratch.\n",
    "if os.path.exists(CHROMA_PATH):\n",
    "    shutil.rmtree(CHROMA_PATH)\n",
    "\n",
    "# Create the vector store from the chunks.\n",
    "db = Chroma.from_documents(chunks, embeddings, persist_directory=CHROMA_PATH)\n",
    "\n",
    "# Save the store to the local directory.\n",
    "db.persist()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "e02f7b6d",
   "metadata": {},
   "source": [
    "## Query the persisted store\n",
    "\n",
    "The store is re-opened from `CHROMA_PATH` rather than reusing the in-memory object, so this cell also checks that the data written above can be loaded back."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "cdea672a",
   "metadata": {},
   "outputs": [],
   "source": [
    "db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embeddings)\n",
    "\n",
    "query = \"\"\"最高人民法院关于处理自首和立功\"\"\"\n",
    "results = db.similarity_search_with_relevance_scores(query, k=TOP_K)\n",
    "\n",
    "# Print the retrieved chunks; the search is already limited to TOP_K results.\n",
    "for doc, score in results:\n",
    "    print(f\"Content: {doc.page_content}\")\n",
    "    print(f\"Relevance Score: {score}\")\n",
    "    print(\"-----\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}