Using LangChain for Retrieval
=============================

This guide helps you build a question-answering application over a local
knowledge base, using ``Qwen1.5-7B-Chat`` together with ``langchain``.
The goal is a knowledge-base Q&A solution that is friendly to many
scenarios and to open-source models, and that can run offline.

Basic Usage
-----------

You can build a question-answering application over your own documents with
``langchain``. The pipeline in this example is:

1. Load the file and read the text.
2. Segment the text and vectorize the segments.
3. Vectorize the question.
4. Match the top-k text vectors most similar to the question vector.
5. Add the matched text to the prompt as context, together with the question.
6. Submit the prompt to ``Qwen1.5-7B-Chat`` to generate an answer.

Below is an example:

.. code:: python

    from transformers import AutoModelForCausalLM, AutoTokenizer
    from abc import ABC
    from langchain.llms.base import LLM
    from typing import Any, List, Mapping, Optional
    from langchain.callbacks.manager import CallbackManagerForLLMRun

    device = "cuda"  # the device to load the model onto

    # Now you do not need to add "trust_remote_code=True"
    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen1.5-7B-Chat",
        torch_dtype="auto",
        device_map="auto"
    )
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")


    class Qwen(LLM, ABC):
        """Wrap the local Qwen1.5 model as a LangChain-compatible LLM."""

        max_token: int = 10000
        temperature: float = 0.01
        top_p = 0.9
        history_len: int = 3

        def __init__(self):
            super().__init__()

        @property
        def _llm_type(self) -> str:
            return "Qwen"

        @property
        def _history_len(self) -> int:
            return self.history_len

        def set_history_len(self, history_len: int = 10) -> None:
            self.history_len = history_len

        def _call(
            self,
            prompt: str,
            stop: Optional[List[str]] = None,
            run_manager: Optional[CallbackManagerForLLMRun] = None,
        ) -> str:
            # Build a chat-format prompt and generate a response with the local model.
            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            model_inputs = tokenizer([text], return_tensors="pt").to(device)
            generated_ids = model.generate(
                model_inputs.input_ids,
                max_new_tokens=512
            )
            generated_ids = [
                output_ids[len(input_ids):]
                for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
            ]
            response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
            return response

        @property
        def _identifying_params(self) -> Mapping[str, Any]:
            """Get the identifying parameters."""
            return {
                "max_token": self.max_token,
                "temperature": self.temperature,
                "top_p": self.top_p,
                "history_len": self.history_len,
            }

After loading the ``Qwen1.5-7B-Chat`` model, specify the txt file that the
knowledge-based Q&A should retrieve from.
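
If you do not have a knowledge base at hand, you can create a small plain-text
file to experiment with; the file name and contents below are only a
hypothetical example:

.. code:: python

    # Hypothetical sample knowledge file for a quick test.
    sample_text = (
        "Large language models are neural networks trained on large amounts of text. "
        "They can answer questions, summarize documents, and write code."
    )
    with open("knowledge.txt", "w", encoding="utf-8") as f:
        f.write(sample_text)

The full retrieval pipeline then looks like this: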

.. code:: python

    import os
    from typing import List, Tuple

    import numpy as np
    from langchain.vectorstores import FAISS
    from langchain.embeddings.huggingface import HuggingFaceEmbeddings
    from langchain.document_loaders import TextLoader
    from langchain.docstore.document import Document
    from langchain.prompts.prompt import PromptTemplate
    from langchain.chains import RetrievalQA
    from chinese_text_splitter import ChineseTextSplitter


    def load_file(filepath, sentence_size=100):
        # Load the txt file and split it into sentence-sized chunks.
        loader = TextLoader(filepath, autodetect_encoding=True)
        textsplitter = ChineseTextSplitter(pdf=False, sentence_size=sentence_size)
        docs = loader.load_and_split(textsplitter)
        write_check_file(filepath, docs)
        return docs


    def write_check_file(filepath, docs):
        # Write the split chunks to a temporary file for inspection.
        folder_path = os.path.join(os.path.dirname(filepath), "tmp_files")
        if not os.path.exists(folder_path):
            os.makedirs(folder_path)
        fp = os.path.join(folder_path, 'load_file.txt')
        with open(fp, 'a+', encoding='utf-8') as fout:
            fout.write("filepath=%s,len=%s" % (filepath, len(docs)))
            fout.write('\n')
            for i in docs:
                fout.write(str(i))
                fout.write('\n')


    def seperate_list(ls: List[int]) -> List[List[int]]:
        # Group consecutive indices into sub-lists, e.g. [1, 2, 4] -> [[1, 2], [4]].
        lists = []
        ls1 = [ls[0]]
        for i in range(1, len(ls)):
            if ls[i - 1] + 1 == ls[i]:
                ls1.append(ls[i])
            else:
                lists.append(ls1)
                ls1 = [ls[i]]
        lists.append(ls1)
        return lists


    class FAISSWrapper(FAISS):
        chunk_size = 250
        chunk_conent = True
        score_threshold = 0

        def similarity_search_with_score_by_vector(
            self, embedding: List[float], k: int = 4
        ) -> List[Tuple[Document, float]]:
            scores, indices = self.index.search(np.array([embedding], dtype=np.float32), k)
            docs = []
            id_set = set()
            store_len = len(self.index_to_docstore_id)
            for j, i in enumerate(indices[0]):
                if i == -1 or 0 < self.score_threshold < scores[0][j]:
                    # This happens when not enough docs are returned.
                    continue
                _id = self.index_to_docstore_id[i]
                doc = self.docstore.search(_id)
                if not self.chunk_conent:
                    if not isinstance(doc, Document):
                        raise ValueError(f"Could not find document for id {_id}, got {doc}")
                    doc.metadata["score"] = int(scores[0][j])
                    docs.append(doc)
                    continue
                id_set.add(i)
                docs_len = len(doc.page_content)
                # Expand the hit with neighboring chunks from the same source
                # until chunk_size is reached.
                for k in range(1, max(i, store_len - i)):
                    break_flag = False
                    for l in [i + k, i - k]:
                        if 0 <= l < len(self.index_to_docstore_id):
                            _id0 = self.index_to_docstore_id[l]
                            doc0 = self.docstore.search(_id0)
                            if docs_len + len(doc0.page_content) > self.chunk_size:
                                break_flag = True
                                break
                            elif doc0.metadata["source"] == doc.metadata["source"]:
                                docs_len += len(doc0.page_content)
                                id_set.add(l)
                    if break_flag:
                        break
            if not self.chunk_conent:
                return docs
            if len(id_set) == 0 and self.score_threshold > 0:
                return []
            id_list = sorted(list(id_set))
            id_lists = seperate_list(id_list)
            # Merge each run of consecutive chunks back into a single document.
            for id_seq in id_lists:
                for id in id_seq:
                    if id == id_seq[0]:
                        _id = self.index_to_docstore_id[id]
                        doc = self.docstore.search(_id)
                    else:
                        _id0 = self.index_to_docstore_id[id]
                        doc0 = self.docstore.search(_id0)
                        doc.page_content += " " + doc0.page_content
                if not isinstance(doc, Document):
                    raise ValueError(f"Could not find document for id {_id}, got {doc}")
                doc_score = min([scores[0][id] for id in [indices[0].tolist().index(i) for i in id_seq if i in indices[0]]])
                doc.metadata["score"] = int(doc_score)
                docs.append((doc, doc_score))
            return docs


    if __name__ == '__main__':
        # load docs (pdf file or txt file)
        filepath = 'your file path'
        # Embedding model name
        EMBEDDING_MODEL = 'text2vec'
        PROMPT_TEMPLATE = """Known information:
    {context_str}
    Based on the above known information, respond to the user's question concisely and professionally. If an answer cannot be derived from it, say 'The question cannot be answered with the given information' or 'Not enough relevant information has been provided,' and do not include fabricated details in the answer. Please respond in English. The question is {question}"""
        # Embedding running device
        EMBEDDING_DEVICE = "cuda"
        # return top-k text chunks from the vector store
        VECTOR_SEARCH_TOP_K = 3
        SENTENCE_SIZE = 50
        CHAIN_TYPE = 'stuff'
        embedding_model_dict = {
            "text2vec": "your text2vec model path",
        }
        llm = Qwen()
        embeddings = HuggingFaceEmbeddings(
            model_name=embedding_model_dict[EMBEDDING_MODEL],
            model_kwargs={'device': EMBEDDING_DEVICE},
        )
        docs = load_file(filepath, sentence_size=SENTENCE_SIZE)
        docsearch = FAISSWrapper.from_documents(docs, embeddings)
        prompt = PromptTemplate(
            template=PROMPT_TEMPLATE, input_variables=["context_str", "question"]
        )
        chain_type_kwargs = {"prompt": prompt, "document_variable_name": "context_str"}
        qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type=CHAIN_TYPE,
            retriever=docsearch.as_retriever(search_kwargs={"k": VECTOR_SEARCH_TOP_K}),
            chain_type_kwargs=chain_type_kwargs,
        )
        query = "Give me a short introduction to large language model."
        print(qa.run(query))
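
If you reuse the same knowledge base across runs, you can avoid re-embedding the
documents every time by persisting the FAISS index to disk. This is a minimal
sketch using the ``save_local``/``load_local`` helpers inherited from LangChain's
FAISS vector store; the folder name is arbitrary, and recent LangChain versions
additionally require ``allow_dangerous_deserialization=True`` when loading:

.. code:: python

    INDEX_DIR = "faiss_index"  # hypothetical folder for the persisted index

    # Build the index once and save it to disk.
    docsearch = FAISSWrapper.from_documents(docs, embeddings)
    docsearch.save_local(INDEX_DIR)

    # In later runs, load the index instead of re-embedding the documents.
    docsearch = FAISSWrapper.load_local(INDEX_DIR, embeddings)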

Next Step
---------

Now you can chat with Qwen1.5 over your own documents. Continue reading
the documentation and try to figure out more advanced usages of
retrieval with the model!
