Commit: fix some bugs

HoratioJSY committed Apr 8, 2024
1 parent 39e580c commit b896bd4
Showing 16 changed files with 69 additions and 118 deletions.
2 changes: 1 addition & 1 deletion MODEL_LICENSE
@@ -1,4 +1,4 @@
-The aiXcoder License
+The aiXcoder Model License

1. Definitions

43 changes: 28 additions & 15 deletions README.md
@@ -1,5 +1,9 @@
# aiXcoder-7B Code Large Language Model

+<p align="center">
+🏠 <a href="https://www.aixcoder.com/" target="_blank">Official website</a>|🛠 <a href="https://marketplace.visualstudio.com/items?itemName=aixcoder-plugin.aixcoder" target="_blank">VS Code Plugin</a>|🛠 <a href="https://plugins.jetbrains.com/plugin/13574-aixcoder-code-completer" target="_blank">Jetbrains Plugin</a>|🤗 <a href="https://huggingface.co/aiXcoder/aiXcoder-7b" target="_blank">Model Weights</a>|<a href="" target="_blank">WeChat</a>|<a href="./assets/wechat_2.jpg" target="_blank">WeChat Official Account</a>
+</p>

Welcome to the official repository of the aiXcoder-7B Code Large Language Model. This model is designed to understand and generate code across multiple programming languages, offering state-of-the-art performance in code completion, comprehension, generation, and other programming-language tasks.

Table of Contents
@@ -34,13 +38,13 @@ In our ongoing exploration to apply large code models, the release of aiXcoder 7
However, further development of the aiXcoder model series is already in motion. In the near future, we aim to release new versions of the model, meticulously instruct-tuned for a wider range of programming tasks, including but not limited to test-case generation and code debugging. Through these instruct-tuned models, we anticipate offering developers more comprehensive and deeper programming support, helping them maximize efficiency at every stage of software development.

![table_1](./assets/table_1.png)
-> aiXcoder 7B surpasses mainstream models in nl2code benchmark.
+> aiXcoder 7B surpasses mainstream models on the nl2code benchmark. aiXcoder-7B is an enhanced version of aiXcoder-7B-Base, fine-tuned for one epoch on one hundred thousand data entries similar to Evol-Instruct.
<br>
<br>

![table_3](./assets/table_3.png)
-> aiXcoder 7B surpasses mainstream models in code completion scenarios.
+> aiXcoder 7B Base surpasses mainstream models in code completion scenarios.
<br>
<br>
@@ -61,6 +65,8 @@ To run the model inference code, you'll need the following environment setup:
Please ensure all dependencies are installed using the following command:

```bash
+conda create -n aixcoder-7b python=3.11
+conda activate aixcoder-7b
git clone git@github.com:aixcoder-plugin/aiXcoder-7b.git
cd aiXcoder-7b
pip install -r requirements.txt
@@ -180,30 +186,36 @@ print(quick_sort(arr)) # [1, 2, 3, 4, 5]

```python

-from transformers import AutoModelForCausalLM, AutoTokenizer
-import torch

+import torch
+import sys
+from hf_mini.utils import input_wrapper
+from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" # the device to load the model onto

tokenizer = AutoTokenizer.from_pretrained("aiXcoder/aiXcoder-7b")
model = AutoModelForCausalLM.from_pretrained("aiXcoder/aiXcoder-7b", torch_dtype=torch.bfloat16)


-text = """▁<AIX-SPAN-PRE>▁<AIX-SPAN-POST>
-# 测试
-arr = [3, 2, 1, 4, 5]
-print(quick_sort(arr)) # [1, 2, 3, 4, 5]▁<AIX-SPAN-MIDDLE># the file path is: test.py
-# the code file is written by Python
-# 快速排序算法"""
+text = input_wrapper(
+    # for FIM style input, code_string stands for prefix context
+    code_string="# 快速排序算法",
+    # for FIM style input, later_code stands for suffix context
+    later_code="\n# 测试\narr = [3, 2, 1, 4, 5]\nprint(quick_sort(arr)) # [1, 2, 3, 4, 5]",
+    # file_path should be a path from project to file
+    path="test.py"
+)

+if len(text) == 0:
+    sys.exit()

inputs = tokenizer(text, return_tensors="pt", return_token_type_ids=False)

inputs = inputs.to(device)
model.to(device)

-outputs = model.generate(**inputs, max_new_tokens=512)
+outputs = model.generate(**inputs, max_new_tokens=256)
print(tokenizer.decode(outputs[0], skip_special_tokens=False))


@@ -240,7 +252,7 @@ def quick_sort(arr):

## Data for aiXcoder 7B

-The core dataset for aiXcoder 7B comprises the programming languages commonly used in development, as well as natural languages closely related to code. The core dataset's programming languages mainly include nearly a hundred mainstream languages such as C++, Python, Java, and JavaScript, while the natural language component primarily consists of StackOverflow Q&As, technical blogs, code documentation, and computer science papers.
+The data for aiXcoder is divided into a core dataset and an extended dataset. The core dataset comprises the programming languages commonly used in development, as well as natural languages closely related to code. The core dataset's programming languages mainly include nearly a hundred mainstream languages such as C++, Python, Java, and JavaScript, while the natural language component primarily consists of StackOverflow Q&As, technical blogs, code documentation, and computer science papers. The extended data mainly consists of filtered open-source code datasets, high-quality English natural language datasets, and high-quality Chinese natural language datasets.

<!-- <br>
<br>
@@ -327,7 +339,7 @@ Currently, the mainstream evaluation dataset for context-aware code completion i

To further evaluate the code completion capabilities of large language models for code in a more fine-grained manner, aiXcoder has built an evaluation dataset that is larger, more diverse in the code being tested, longer in code context, and closer to real-world development projects. This evaluation dataset will be open-sourced on GitHub at the same time. During evaluation, we ensure that the different large language models for code use the same maximum sequence length of 16K, and we measure generation performance across thirteen scenarios in total, such as generating complete method blocks, conditional blocks, loop-processing blocks, and exception-handling blocks.

-Table 3 shows the average generation performance of different models in different languages. The final evaluation results are the average of all completion scenarios and evaluation samples. The aiXcoder 7B model achieves the best performance across major programming languages and various evaluation criteria, indicating that aiXcoder 7B has the best basic code completion capability among all open-source models of the same scale and is the most suitable base model for providing code completion capabilities in real-world programming scenarios.
+Table 3 shows the average generation performance of different models in different languages. The final evaluation results are the average of all completion scenarios and evaluation samples. The aiXcoder 7B Base model achieves the best performance across major programming languages and various evaluation criteria, indicating that aiXcoder 7B Base has the best basic code completion capability among all open-source models of the same scale and is the most suitable base model for providing code completion capabilities in real-world programming scenarios.

![table_3](./assets/table_3.png)

@@ -362,11 +374,12 @@ In Table 8, we first evaluate the generation capability of each large language m
## License


-This project is licensed under the [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) License - see the LICENSE file for details. The model weights are licensed under the Model License.
+The source code in this repository is licensed under the [Apache-2.0](https://www.apache.org/licenses/LICENSE-2.0) License - see the LICENSE file for details.
+The model weights are licensed under the [Model License](./MODEL_LICENSE) for academic research use; for commercial use, please apply by sending an email to [email protected].


## Acknowledgments

We would like to thank all contributors to the open-source projects and datasets that made this work possible.

-Thank you for your interest in our Code Large Language Model. We look forward to your contributions and feedback!
\ No newline at end of file
+Thank you for your interest in our Code Large Language Model. We look forward to your contributions and feedback!
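
A note for readers comparing the two README examples: judging from the deleted lines, `input_wrapper` assembles the same structured fill-in-the-middle (FIM) prompt that the old example built by hand with special tokens. Below is a minimal sketch of that layout, reconstructed only from the removed lines; the real implementation lives in `hf_mini/utils.py` and, per this commit, also returns an empty string when the sensitive-information check fails.

```python
# Hypothetical reconstruction for illustration; not the actual input_wrapper.
def build_fim_prompt(code_string: str, later_code: str = "", path: str = "") -> str:
    # Layout taken from the deleted README example:
    #   ▁<AIX-SPAN-PRE> ▁<AIX-SPAN-POST> {suffix} ▁<AIX-SPAN-MIDDLE> {path header} {prefix}
    header = ""
    if path:
        # The real wrapper presumably derives the language line from the file
        # extension (see extension_pattern in hf_mini/utils.py); Python is
        # hard-coded here to match the removed example.
        header = f"# the file path is: {path}\n# the code file is written by Python\n"
    return f"▁<AIX-SPAN-PRE>▁<AIX-SPAN-POST>{later_code}▁<AIX-SPAN-MIDDLE>{header}{code_string}"

# Reproduces the removed hand-built prompt:
print(build_fim_prompt(
    code_string="# 快速排序算法",
    later_code="\n# 测试\narr = [3, 2, 1, 4, 5]\nprint(quick_sort(arr)) # [1, 2, 3, 4, 5]",
    path="test.py",
))
```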
Binary file modified assets/table_1.png
Binary file modified assets/table_2.png
Binary file modified assets/table_3.png
Binary file modified assets/table_4.png
Binary file modified assets/table_5.png
Binary file modified assets/table_6.png
Binary file modified assets/table_7.png
Binary file modified assets/table_8.png
Binary file added assets/wechat_2.jpg
15 changes: 14 additions & 1 deletion hf_mini/utils.py
@@ -1090,7 +1090,20 @@


import re
-def input_wrapper(code_string, later_code: str = "", path: str = ""):
+from hf_mini.filter import SensitiveInforRM
+is_security = SensitiveInforRM()
+
+def input_wrapper(code_string, later_code: str = "", path: str = "") -> str:
+
+    _security = True
+    for i in [code_string, later_code, path]:
+        if not is_security.is_security(i):
+            _security = False
+            break
+
+    if not _security:
+        return ""
+
    extension_pattern = re.compile(r"(\.\w+)$")
    p = ""
    if isinstance(path, str) and len(path) > 0:
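
The behavioral contract introduced in this hunk is worth spelling out: `input_wrapper` now returns an empty string whenever any of its inputs fails the `SensitiveInforRM` check, so callers must treat `""` as a refusal rather than a prompt. A minimal sketch of the gate, assuming only the `is_security(str) -> bool` method visible in this diff:

```python
from hf_mini.filter import SensitiveInforRM  # class and method names from this diff

_checker = SensitiveInforRM()  # built once at import time, like the module-level instance above

def passes_security(*fields: str) -> bool:
    """Return True only if every field clears the sensitive-information check."""
    return all(_checker.is_security(f) for f in fields)

# Caller-side handling, mirroring the updated sess_huggingface.py:
#   text = input_wrapper(code_string, later_code, path)
#   if len(text) == 0:   # the filter rejected the input; nothing to complete
#       sys.exit()
```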
15 changes: 14 additions & 1 deletion megatron_mini/utils.py
@@ -8,7 +8,7 @@
import datetime
import torch
from megatron_mini.model.module import ModelType
-
+from megatron_mini.filter import SensitiveInforRM


from megatron_mini import (
@@ -1210,6 +1210,7 @@ def __init__(self, rank: int = 0, model_path: str = "", logger_info=True):
            self.bos_id, self.eos_id, self.pad_id, self.eot_id,
            self.prefix_id, self.middle_id, self.suffix_id
        }
+        self.is_security = SensitiveInforRM()

        if rank == 0 and logger_info:
            print(
@@ -1240,6 +1241,18 @@ def __encode(self, s: str, path: str = None, is_fim: bool = False) -> List[int]:
        return self.sp_model.encode(p + s)

    def encode(self, code_string: str, later_code: str, file_path: str) -> List[int]:
+
+        start = time.time()
+        _security = True
+        for i in [code_string, later_code, file_path]:
+            if not self.is_security.is_security(i):
+                _security = False
+                break
+        print(f"Done input checking in {(time.time()-start) * 1000:.2f}ms", flush=True)
+
+        if not _security:
+            return []
+
        assert len(code_string) > 0
        if len(later_code) == 0:
            t = self.__encode(code_string, file_path, False)
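
One thing this hunk does not show: the new `encode` body calls `time.time()`, so an `import time` presumably already exists earlier in `megatron_mini/utils.py`; it is not added by this commit. A self-contained sketch of the same measure-then-gate pattern, with the empty token list as the rejection sentinel (`SensitiveInforRM` internals are assumed, not shown here):

```python
import time
from typing import Callable, List

def encode_with_gate(
    tokenize: Callable[[str, str, str], List[int]],
    is_security: Callable[[str], bool],
    code_string: str,
    later_code: str,
    file_path: str,
) -> List[int]:
    # Time the check, as the diff does, then gate tokenization on the result.
    start = time.time()
    safe = all(is_security(s) for s in (code_string, later_code, file_path))
    print(f"Done input checking in {(time.time() - start) * 1000:.2f}ms", flush=True)
    if not safe:
        return []  # empty list tells run_infer to skip generation entirely
    return tokenize(code_string, later_code, file_path)
```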
102 changes: 4 additions & 98 deletions requirements.txt
@@ -1,98 +1,4 @@
-asttokens==2.0.5
-astunparse==1.6.3
-attrs==23.1.0
-backcall==0.2.0
-beautifulsoup4==4.12.2
-boltons==23.0.0
-brotlipy==0.7.0
-certifi==2023.7.22
-cffi==1.15.1
-chardet==4.0.0
-charset-normalizer==2.0.4
-click==8.0.4
-conda==23.9.0
-conda-build==3.27.0
-conda-content-trust==0.2.0
-conda_index==0.3.0
-conda-libmamba-solver==23.7.0
-conda-package-handling==2.2.0
-conda_package_streaming==0.9.0
-cryptography==41.0.3
-decorator==5.1.1
-dnspython==2.4.2
-exceptiongroup==1.0.4
-executing==0.8.3
-expecttest==0.1.6
-filelock==3.9.0
-fsspec==2023.9.2
-gmpy2==2.1.2
-huggingface-hub==0.22.1
-hypothesis==6.87.1
-idna==3.4
-ipython==8.15.0
-jedi==0.18.1
-Jinja2==3.1.2
-jsonpatch==1.32
-jsonpointer==2.1
-libarchive-c==2.9
-libmambapy==1.4.1
-MarkupSafe==2.1.1
-matplotlib-inline==0.1.6
-mkl-fft==1.3.8
-mkl-random==1.2.4
-mkl-service==2.4.0
-more-itertools==8.12.0
-mpmath==1.3.0
-networkx==3.1
-numpy==1.26.0
-packaging==23.1
-parso==0.8.3
-pexpect==4.8.0
-pickleshare==0.7.5
-Pillow==9.4.0
-pip==23.2.1
-pkginfo==1.9.6
-pluggy==1.0.0
-prompt-toolkit==3.0.36
-psutil==5.9.0
-ptyprocess==0.7.0
-pure-eval==0.2.2
-pycosat==0.6.4
-pycparser==2.21
-Pygments==2.15.1
-pyOpenSSL==23.2.0
-PySocks==1.7.1
-python-etcd==0.4.5
-pytz==2023.3.post1
-PyYAML==6.0
-regex==2023.12.25
-requests==2.31.0
-ruamel.yaml==0.17.21
-ruamel.yaml.clib==0.2.6
-safetensors==0.4.2
-sentencepiece==0.2.0
-setuptools==68.0.0
-six==1.16.0
-some-package==0.1
-sortedcontainers==2.4.0
-soupsieve==2.5
-stack-data==0.2.0
-sympy==1.11.1
-tokenizers==0.15.2
-tomli==2.0.1
-toolz==0.12.0
-torch==2.1.0
-torchaudio==2.1.0
-torchelastic==0.2.2
-torchvision==0.16.0
-tqdm==4.65.0
-traitlets==5.7.1
-transformers==4.39.2
-triton==2.1.0
-truststore==0.8.0
-types-dataclasses==0.6.6
-typing_extensions==4.7.1
-urllib3==1.26.16
-wcwidth==0.2.5
-wheel==0.41.2
-zstandard==0.19.0
+numpy>=1.24.4
+sentencepiece>=0.2.0
+torch>=2.1.0
+transformers>=4.34.1
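
The old requirements file reads like a full environment export (it even pins `conda`, `conda-build`, and IPython internals); the replacement keeps only the four runtime dependencies, as minimum versions rather than exact pins. Combined with the environment steps added to the README, installation reduces to:

```bash
conda create -n aixcoder-7b python=3.11
conda activate aixcoder-7b
pip install -r requirements.txt
```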
7 changes: 5 additions & 2 deletions sess_huggingface.py
100644 → 100755
@@ -1,11 +1,12 @@
import torch
+import sys
from hf_mini.utils import input_wrapper
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda" # the device to load the model onto

-tokenizer = AutoTokenizer.from_pretrained("/data2/aix3_base_7b/aiXcoder-7b")
-model = AutoModelForCausalLM.from_pretrained("/data2/aix3_base_7b/aiXcoder-7b", torch_dtype=torch.bfloat16)
+tokenizer = AutoTokenizer.from_pretrained("aiXcoder/aiXcoder-7b")
+model = AutoModelForCausalLM.from_pretrained("aiXcoder/aiXcoder-7b", torch_dtype=torch.bfloat16)


text = input_wrapper(
@@ -14,6 +15,8 @@
    path="test.py"
)

+if len(text) == 0:
+    sys.exit()

inputs = tokenizer(text, return_tensors="pt", return_token_type_ids=False)

3 changes: 3 additions & 0 deletions sess_megatron.py
@@ -253,6 +253,9 @@ def run_infer(self, code_string: str, max_new_tokens: int = 256, later_code: str
            code_string=code_string, later_code=later_code, file_path=file_path
        )

+        if len(tokens) == 0:
+            return self.sess.sync_obj_info("")
+
        predict_list = []
        common_len = 0
        while True:
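
Putting the megatron-side pieces together: `encode` gates the inputs and returns `[]` on rejection, and `run_infer` now maps that sentinel to an empty completion instead of running generation. A compressed, hypothetical sketch of the resulting control flow (`sync_obj_info` is assumed, from its use here, to broadcast and return the string; the generation call is a stand-in):

```python
def run_infer_sketch(sess, tokenizer, code_string, later_code="", file_path=""):
    tokens = tokenizer.encode(
        code_string=code_string, later_code=later_code, file_path=file_path
    )
    if len(tokens) == 0:
        # encode() rejected the input: surface an empty result to the caller
        return sess.sync_obj_info("")
    return sess.generate(tokens)  # placeholder for the actual decode loop
```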