4 Commits

Author SHA1 Message Date
  lishangjing2024 99b93ed168 Update the directory tree 1 week ago
  YuXuan Yang b8d5351603 !2 feat(RAG): Add legal article extraction module (Regex+LLM) 1 week ago
  ShangjingLee ee691b9e41 Migrate the Orange Pi environment setup scripts 2 weeks ago
  ShangjingLee 7be8289416 Pin the gradio version and change the download logic 2 weeks ago
19 changed files with 1729 additions and 11 deletions
  1. .gitignore (+4, -1)
  2. README.md (+59, -1)
  3. chatlaw/dataloader/downloads.py (+2, -4)
  4. develop/RAG/batch_extract_all_laws.py (+85, -0)
  5. develop/RAG/extract_law_pure_regex.py (+105, -0)
  6. develop/RAG/extract_regex_llm_repair.py (+351, -0)
  7. develop/RAG/extract_sliding_window.py (+441, -0)
  8. develop/RAG/requirements.txt (+25, -0)
  9. develop/RAG/verify_extraction.py (+290, -0)
  10. develop/README.md (+144, -0)
  11. requirements.txt (+1, -2)
  12. scripts/orangepi/310b/CANN_installer.sh (+1, -1)
  13. scripts/orangepi/310b/aclinit.json (+1, -0)
  14. scripts/orangepi/310b/mindspore_installer.sh (+25, -1)
  15. scripts/orangepi/310b/preparation.sh (+3, -1)
  16. scripts/orangepi/310p/CANN_installer.sh (+42, -0)
  17. scripts/orangepi/310p/mindspore_installer.sh (+51, -0)
  18. scripts/orangepi/310p/preparation.sh (+46, -0)
  19. scripts/orangepi/README.md (+53, -0)

.gitignore (+4, -1)

@@ -6,4 +6,7 @@ resources/
__pycache__/
openi_resource.version
chatlaw.egg-info/
build/
build/

# Development environment resources directory
develop/resources/

README.md (+59, -1)

@@ -72,7 +72,65 @@ ___
## Software Architecture
### 1. Project directory (updated as the project evolves)
```

ChatLaw/
├── chatlaw
│   ├── clear_files.py
│   ├── client
│   │   ├── client_ms.py
│   │   ├── client_pt.py
│   │   ├── __init__.py
│   │   └── utils
│   │       ├── common_utils.py
│   │       ├── __init__.py
│   │       ├── utils_ms.py
│   │       └── utils_pt.py
│   ├── configuration.py
│   ├── config.yaml
│   ├── dataloader
│   │   ├── downloads.py
│   │   └── __init__.py
│   ├── __init__.py
│   └── server
│       ├── dataloader
│       │   └── __init__.py
│       ├── __init__.py
│       ├── server_ms.py
│       └── server_pt.py
├── develop
│   ├── RAG
│   │   ├── batch_extract_all_laws.py
│   │   ├── extract_law_pure_regex.py
│   │   ├── extract_regex_llm_repair.py
│   │   ├── extract_sliding_window.py
│   │   ├── requirements.txt
│   │   └── verify_extraction.py
│   └── README.md
├── launcher.py
├── LICENSE
├── MANIFEST.in
├── README.md
├── requirements.txt
├── scripts
│   ├── dhcp_linux_client.sh
│   ├── dhcp_linux_server.sh
│   ├── dhcp_mac_client.sh
│   ├── orangepi
│   │   ├── 310b
│   │   │   ├── aclinit.json
│   │   │   ├── CANN_installer.sh
│   │   │   ├── mindspore_installer.sh
│   │   │   └── preparation.sh
│   │   ├── 310p
│   │   │   ├── CANN_installer.sh
│   │   │   ├── mindspore_installer.sh
│   │   │   └── preparation.sh
│   │   └── README.md
│   ├── push_to_all_repositories
│   ├── pylint_check.sh
│   ├── static_ip_linux_client.sh
│   ├── static_ip_mac_client.sh
│   └── static_ip_server.sh
└── setup.py
```
### 2. Project structure diagram
<p align="center">


chatlaw/dataloader/downloads.py (+2, -4)

@@ -9,11 +9,9 @@ def download_resources(resource_type):
    if resource_type == "tokenizer":
        openi.openi_download_file("enter/QwenTokenizer", repo_type="dataset", local_dir=get_resources_path(), max_workers=10)
    elif resource_type == "llm":
        # openi_download_file("enter/QwenModel", repo_type="model", local_dir=os.path.join(get_resources_path(), "llm"), max_workers=10)
        openi.download_model(repo_id="enter/ChatLaw",
                             model_name="QwenModel", save_path=os.path.join(get_resources_path(), "llm"))
        openi.openi_download_file("enter/QwenModel", repo_type="model", local_dir=os.path.join(get_resources_path(), "llm"), max_workers=10)
    elif resource_type == "video_model":
        openi.openi_download_file("enter/QwenModel", repo_type="dataset", local_dir=get_resources_path(), max_workers=10)
        openi.openi_download_file("enter/VoskModel", repo_type="dataset", local_dir=get_resources_path(), max_workers=10)


__all__ = ["download_resources"]

develop/RAG/batch_extract_all_laws.py (+85, -0)

@@ -0,0 +1,85 @@
"""
批量提取所有法律PDF的条文
"""
import os
import subprocess
from pathlib import Path
from tqdm import tqdm

# Configuration (adjust to your environment)
SCRIPT_DIR = Path(__file__).parent
PDF_DIR = r".\Legal Documents"  # directory containing the PDF files
OUTPUT_DIR = str(SCRIPT_DIR / "output")  # output directory (relative path)
SCRIPT_PATH = str(SCRIPT_DIR / "extract_regex_llm_repair.py")
PYTHON_PATH = "python"  # use the python from the current environment
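# Example: with the defaults above, a PDF named "民法典.pdf" (hypothetical name)
# inside PDF_DIR is written to OUTPUT_DIR/民法典.json, i.e. the script runs:
#   python extract_regex_llm_repair.py -p "<PDF_DIR>/民法典.pdf" -o "<OUTPUT_DIR>/民法典.json"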

def main():
    # Make sure the output directory exists
    os.makedirs(OUTPUT_DIR, exist_ok=True)
    # Collect all PDF files
    pdf_files = list(Path(PDF_DIR).glob("*.pdf"))
    print(f"Found {len(pdf_files)} PDF files")
    # Track the results
    success = []
    failed = []
    for pdf_file in tqdm(pdf_files, desc="Progress"):
        # Build the output file name (keep the original stem)
        output_name = pdf_file.stem + ".json"
        output_path = os.path.join(OUTPUT_DIR, output_name)
        # Skip files that were already processed
        if os.path.exists(output_path):
            print(f"\nSkipped (already exists): {pdf_file.name}")
            success.append(pdf_file.name)
            continue
        print(f"\n\n{'='*60}")
        print(f"Processing: {pdf_file.name}")
        print(f"{'='*60}")
        # Build the command
        cmd = [
            PYTHON_PATH,
            SCRIPT_PATH,
            "-p", str(pdf_file),
            "-o", output_path
        ]
        try:
            # Run the extraction script
            result = subprocess.run(
                cmd,
                cwd=Path(SCRIPT_PATH).parent,
                capture_output=False,
                text=True
            )
            if result.returncode == 0:
                success.append(pdf_file.name)
                print(f"✅ Succeeded: {pdf_file.name}")
            else:
                failed.append((pdf_file.name, f"exit code: {result.returncode}"))
                print(f"❌ Failed: {pdf_file.name}")
        except Exception as e:
            failed.append((pdf_file.name, str(e)))
            print(f"❌ Error: {pdf_file.name} - {e}")
    # Print a summary
    print("\n" + "="*60)
    print("Summary")
    print("="*60)
    print(f"✅ Succeeded: {len(success)}")
    print(f"❌ Failed: {len(failed)}")
    if failed:
        print("\nFailures:")
        for name, reason in failed:
            print(f"  - {name}: {reason}")
    print(f"\nOutput directory: {OUTPUT_DIR}")

if __name__ == "__main__":
    main()

develop/RAG/extract_law_pure_regex.py (+105, -0)

@@ -0,0 +1,105 @@
"""
法律条文提取脚本(纯正则版)

功能:使用正则精准切分法律条文,不依赖 LLM,速度极快

用法:python extract_law_pure_regex.py -p <PDF文件路径>
"""

import os
import json
import re
import argparse
from pathlib import Path
from typing import List, Dict

import fitz # PyMuPDF


def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the full text of a PDF."""
    doc = fitz.open(pdf_path)
    text_parts = []
    for page in doc:
        text_parts.append(page.get_text())
    doc.close()
    return "\n".join(text_parts)


def clean_content(text: str) -> str:
    """Clean up article text."""
    text = re.sub(r'\n+', ' ', text)
    text = re.sub(r' +', ' ', text)
    return text.strip()


def split_by_articles(text: str, law_name: str) -> List[Dict]:
    """Split the text into articles with a regular expression."""
    # Strictly match "第X条" (Article X); chapter/section/part headings
    # ("章", "节", "编") never match because "条" is required
    pattern = re.compile(r'(?:\n|^)\s*(第\s*[一二三四五六七八九十百千零0-9\s]+条)(?![一二三四五六七八九十])')
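    # Behavior example: "第 三十九 条" at the start of a line matches and is
    # later normalized to "第三十九条"; a candidate such as "第三十九条二",
    # where the numeral run continues past "条" (typical of PDF line-break
    # artifacts), is rejected by the negative lookahead.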

    matches = list(pattern.finditer(text))
    articles = []

    for i, match in enumerate(matches):
        start = match.start()
        article_id = re.sub(r'\s+', '', match.group(1))

        if i < len(matches) - 1:
            end = matches[i + 1].start()
        else:
            end = len(text)

        raw_content = text[start:end].strip()
        # Strip the leading article number
        content = re.sub(r'^第[一二三四五六七八九十百千零0-9\s]+条\s*', '', raw_content)
        content = clean_content(content)

        articles.append({
            "article_number": article_id,
            "content": content,
            "title": f"{law_name}{article_id}" if law_name else ""
        })

    return articles


def main():
    parser = argparse.ArgumentParser(description="Legal article extraction (pure regex)")
    parser.add_argument("--pdf", "-p", required=True, help="Path to the PDF file")
    parser.add_argument("--output", "-o", help="Path of the output JSON file")
    args = parser.parse_args()

    if not os.path.exists(args.pdf):
        print(f"Error: file not found {args.pdf}")
        return

    # Derive the law name from the file name
    pdf_name = Path(args.pdf).stem
    law_name = re.sub(r'_\d+$', '', pdf_name)

    print(f"Reading PDF: {args.pdf}")
    text = extract_text_from_pdf(args.pdf)

    print("Splitting articles...")
    articles = split_by_articles(text, law_name)
    print(f"Extracted {len(articles)} articles")

    # Save the result
    if args.output is None:
        output_dir = Path(__file__).parent / "resources" / "extracted_pure"
        output_dir.mkdir(parents=True, exist_ok=True)
        args.output = str(output_dir / f"{pdf_name}_struct.json")

    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump({
            "source": args.pdf,
            "total": len(articles),
            "articles": articles
        }, f, ensure_ascii=False, indent=2)

    print(f"Saved to: {args.output}")


if __name__ == "__main__":
    main()

develop/RAG/extract_regex_llm_repair.py (+351, -0)

@@ -0,0 +1,351 @@
"""
法律条文提取脚本(正则切分 + LLM 结构化版)

功能:
1. 使用正则快速切分法律条文
2. 使用 Qwen2.5-7B-Instruct-AWQ 进行结构化清洗
3. 支持 Batch 处理,跑满 GPU

优势:
- 速度极快(无需滑动窗口)
- 准确率高(正则定位边界)
- 结构化好(LLM 专注清洗)
"""

import os
import json
import re
import argparse
import time
from pathlib import Path
from typing import List, Dict

import fitz # PyMuPDF
import torch
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer

# ==================== Configuration ====================

# Model configuration
# Regular (non-AWQ) checkpoint loaded with 4-bit quantization (path relative to this script)
MODEL_NAME = str(Path(__file__).parent / "resources" / "Qwen2.5-7B-Instruct")

# Generation configuration
BATCH_SIZE = 8  # dynamic 4-bit quantization uses slightly more VRAM; start at 8, raise to 16 if memory allows
MAX_NEW_TOKENS = 1024  # generous limit to avoid truncation

# ==================== Text processing ====================

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the full text of a PDF."""
    doc = fitz.open(pdf_path)
    text_parts = []
    for page in doc:
        text_parts.append(page.get_text())
    doc.close()
    return "\n".join(text_parts)

def split_by_articles(text: str) -> List[Dict]:
    """
    Split the text into articles with a regular expression.
    Returns: [{"id": "第一条", "raw_text": "第一条...", "start": 0, "end": 100}, ...]
    """
    # Match "第X条" (strict mode)
    pattern = re.compile(r'(?:\n|^)\s*(第\s*[一二三四五六七八九十百千零0-9\s]+条)(?![一二三四五六七八九十])')

    matches = list(pattern.finditer(text))
    articles = []

    for i, match in enumerate(matches):
        start = match.start()
        article_id = re.sub(r'\s+', '', match.group(1))

        if i < len(matches) - 1:
            end = matches[i+1].start()
        else:
            end = len(text)

        raw_content = text[start:end].strip()

        articles.append({
            "id": article_id,
            "raw_text": raw_content,
            "start": start,
            "end": end
        })

    return articles

# ==================== LLM processing ====================

def load_model(model_path: str):
    print(f"Loading model: {model_path}")
    try:
        tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)

        # Load with bitsandbytes 4-bit quantization
        from transformers import BitsAndBytesConfig
        quantization_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_compute_dtype=torch.float16,
            bnb_4bit_quant_type="nf4"
        )

        model = AutoModelForCausalLM.from_pretrained(
            model_path,
            device_map="auto",
            trust_remote_code=True,
            quantization_config=quantization_config
        )
        print("Model loaded (4-bit quantization)")
        return model, tokenizer
    except Exception as e:
        print(f"Failed to load model: {e}")
        return None, None

def process_batch(model, tokenizer, batch_articles: List[Dict]) -> List[Dict]:
    """Structure a batch of articles with the LLM."""

    prompts = []
    for art in batch_articles:
        raw = art['raw_text']
        # Build the prompt (kept in Chinese to match the source statutes)
        prompt = f"""你是一个法律助手。请将以下法律条文文本转换为 JSON 格式。
要求:
1. 提取 "article_number"(如"第一条")
2. 提取 "content"(正文内容,去除换行符和多余空格)
3. 提取 "title"(如果条文第一句显然是标题/定义,则提取,否则为空字符串)
4. 直接输出 JSON,不要 Markdown 标记。

待处理文本:
{raw}

JSON 输出:"""

        messages = [
            {"role": "system", "content": "你是一个精确的法律条文解析器。"},
            {"role": "user", "content": prompt}
        ]
        text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
        prompts.append(text)

    # Encode the whole batch (left padding for decoder-only generation)
    inputs = tokenizer(prompts, return_tensors="pt", padding=True, padding_side="left").to(model.device)

    # Generate for the whole batch
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated part
    input_len = inputs.input_ids.shape[1]
    generated_ids = outputs[:, input_len:]
    responses = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)

    results = []
    for i, response in enumerate(responses):
        try:
            # Try to parse the JSON, stripping a Markdown fence first if present
            json_str = response.strip()
            if json_str.startswith("```json"):
                json_str = json_str[7:-3]
            elif json_str.startswith("```"):
                json_str = json_str[3:-3]

            data = json.loads(json_str)
            # If the LLM returned a list, take its first element
            if isinstance(data, list):
                if len(data) > 0:
                    data = data[0]
                else:
                    raise ValueError("Empty list returned")
            results.append(data)
        except Exception:
            # Parsing failed; fall back to the raw text
            results.append({
                "article_number": batch_articles[i]['id'],
                "content": batch_articles[i]['raw_text'],
                "title": "",
                "error": "json_parse_fail"
            })

    return results


def flatten_articles(articles: List) -> List[Dict]:
    """Flatten a possibly nested list of articles."""
    result = []
    for item in articles:
        if isinstance(item, dict):
            result.append(item)
        elif isinstance(item, list):
            # Flatten recursively
            result.extend(flatten_articles(item))
    return result


def is_incomplete(article: Dict) -> bool:
    """Detect whether an entry is incomplete."""
    # article_number missing or malformed
    num = article.get('article_number', '')
    if not num or not re.match(r'^第[一二三四五六七八九十百千零0-9]+条$', num):
        return True
    # content too short (probably truncated)
    content = article.get('content', '')
    if len(content) < 10:
        return True
    return False
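# Examples: an entry whose number came back empty, or as "第 一 条" with internal
# spaces, fails the strict pattern above, and any content shorter than 10
# characters counts as truncated; either condition sends the entry to repair.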


def repair_single(model, tokenizer, full_text: str, raw_article: Dict, window_size: int = 2000) -> Dict:
    """
    Re-extract a single article using a larger context window.
    """
    # Take the original span and widen the context around it
    start = raw_article.get('start', 0)
    end = raw_article.get('end', len(full_text))

    # Expand the window
    ctx_start = max(0, start - window_size // 2)
    ctx_end = min(len(full_text), end + window_size // 2)
    context = full_text[ctx_start:ctx_end]

    article_id = raw_article['id']

    prompt = f"""你是一个法律助手。请从以下文本中提取条文 "{article_id}" 的完整内容。

要求:
1. 提取 "article_number"(即"{article_id}")
2. 提取 "content"(该条的完整正文,去除换行符和多余空格)
3. 提取 "title"(如果条文有标题则提取,否则为空字符串)
4. 只输出这一条的 JSON,不要其他内容

文本上下文:
{context}

JSON 输出:"""

    messages = [
        {"role": "system", "content": "你是一个精确的法律条文解析器。"},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

    inputs = tokenizer(text, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id
        )

    input_len = inputs.input_ids.shape[1]
    generated_ids = outputs[:, input_len:]
    response = tokenizer.decode(generated_ids[0], skip_special_tokens=True)

    try:
        json_str = response.strip()
        if json_str.startswith("```json"):
            json_str = json_str[7:-3]
        elif json_str.startswith("```"):
            json_str = json_str[3:-3]
        data = json.loads(json_str)
        data['repaired'] = True
        return data
    except Exception:
        return {
            "article_number": article_id,
            "content": raw_article['raw_text'],
            "title": "",
            "repair_failed": True
        }

# ==================== Main ====================

def main():
    parser = argparse.ArgumentParser(description="Regex + LLM legal article extraction")
    parser.add_argument("--pdf", "-p", required=True, help="Path to the PDF file")
    parser.add_argument("--output", "-o", default=None, help="Path of the output file")
    parser.add_argument("--model", "-m", default=MODEL_NAME, help="Model path")
    parser.add_argument("--batch", "-b", type=int, default=BATCH_SIZE, help="Batch size")

    args = parser.parse_args()

    # 1. Extract the text
    print(f"Reading PDF: {args.pdf}")
    full_text = extract_text_from_pdf(args.pdf)

    # 2. Regex split
    print("Splitting with the regex...")
    raw_articles = split_by_articles(full_text)
    print(f"Split into {len(raw_articles)} articles")

    if not raw_articles:
        print("No articles found; check the file content or the regex.")
        return

    # 3. Load the model
    if not os.path.exists(args.model):
        print(f"Error: model path does not exist {args.model}")
        print("Please download Qwen2.5-7B-Instruct first")
        return

    model, tokenizer = load_model(args.model)
    if not model:
        return

    # 4. Batch processing (first pass)
    print("Running LLM structuring (first pass)...")
    final_articles = []

    for i in tqdm(range(0, len(raw_articles), args.batch), desc="First pass"):
        batch = raw_articles[i : i + args.batch]
        processed = process_batch(model, tokenizer, batch)
        final_articles.extend(processed)

    # Flatten any nested lists
    final_articles = flatten_articles(final_articles)

    # 5. Detect incomplete entries and repair them (second pass)
    incomplete_indices = [i for i, art in enumerate(final_articles) if is_incomplete(art)]

    if incomplete_indices:
        print(f"\nFound {len(incomplete_indices)} incomplete entries; repairing with a larger window...")
        for idx in tqdm(incomplete_indices, desc="Repairing"):
            repaired = repair_single(model, tokenizer, full_text, raw_articles[idx])
            final_articles[idx] = repaired
        print("Repair finished")
    else:
        print("All entries are complete; nothing to repair")

    # 6. Save the result
    if args.output is None:
        pdf_name = Path(args.pdf).stem
        # Save under resources/extracted_v2
        output_dir = Path(__file__).parent / "resources" / "extracted_v2"
        output_dir.mkdir(parents=True, exist_ok=True)
        args.output = str(output_dir / f"{pdf_name}_struct.json")
    with open(args.output, 'w', encoding='utf-8') as f:
        json.dump({
            "source": args.pdf,
            "total": len(final_articles),
            "articles": final_articles
        }, f, ensure_ascii=False, indent=2)
    print(f"Done; result saved to: {args.output}")

if __name__ == "__main__":
    main()

develop/RAG/extract_sliding_window.py (+441, -0)

@@ -0,0 +1,441 @@
"""
法律条文提取脚本

功能:使用滑动窗口 + LLM 从 PDF 文档中提取法律条文,以 JSON 格式保存
模型:Qwen2.5-3B-Instruct(约3B参数,显存友好)
"""

import os
import json
import re
import argparse
from pathlib import Path
from typing import Generator

import fitz # PyMuPDF
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer


# ==================== Configuration ====================

# Model configuration
# Model path (relative to this script)
MODEL_NAME = str(Path(__file__).parent / "resources" / "Qwen2.5-3B-Instruct")

# Sliding-window configuration
WINDOW_SIZE = 2000  # window size (characters)
OVERLAP_SIZE = 500  # overlap size (characters)

# Generation configuration
MAX_NEW_TOKENS = 4096  # maximum number of generated tokens
TEMPERATURE = 0.1  # low temperature for more deterministic output


# ==================== PDF parsing ====================

def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extract the full text of a PDF file.
    Args:
        pdf_path: path to the PDF file
    Returns:
        the extracted text
    """
    doc = fitz.open(pdf_path)
    text_parts = []
    for page_num in range(len(doc)):
        page = doc[page_num]
        text = page.get_text()
        text_parts.append(text)
    doc.close()
    # Join all pages and clean up excess whitespace
    full_text = "\n".join(text_parts)
    # Collapse runs of blank lines
    full_text = re.sub(r'\n{3,}', '\n\n', full_text)
    return full_text


# ==================== Sliding window ====================

def sliding_window(text: str, window_size: int = WINDOW_SIZE,
                   overlap: int = OVERLAP_SIZE) -> Generator[tuple[int, str], None, None]:
    """
    Sliding-window generator.
    Args:
        text: input text
        window_size: window size
        overlap: overlap size
    Yields:
        (window index, window text)
    """
    step = window_size - overlap
    start = 0
    window_idx = 0
    while start < len(text):
        end = min(start + window_size, len(text))
        window_text = text[start:end]
        yield window_idx, window_text
        window_idx += 1
        start += step
        # Stop once the end of the text is reached
        if end == len(text):
            break
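# Example: with WINDOW_SIZE=2000 and OVERLAP_SIZE=500 the step is 1500, so a
# 5000-character text yields the windows [0, 2000), [1500, 3500) and
# [3000, 5000); the 500-character overlap lets an article that straddles one
# window boundary appear intact in the next window, provided it is shorter
# than the overlap.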


# ==================== LLM extraction ====================

def load_model(model_name: str = MODEL_NAME, device: str = "auto"):
    """
    Load the model and tokenizer.
    Args:
        model_name: model name or path
        device: device; "auto" selects automatically
    Returns:
        (model, tokenizer)
    """
    print(f"Loading model: {model_name}")
    tokenizer = AutoTokenizer.from_pretrained(
        model_name,
        trust_remote_code=True
    )
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype="auto",
        device_map=device,
        trust_remote_code=True
    )
    print("Model loaded")
    return model, tokenizer


def extract_articles_from_window(model, tokenizer, window_text: str) -> list[dict]:
    """
    Extract legal articles from one window of text with the LLM.
    Args:
        model: the language model
        tokenizer: the tokenizer
        window_text: the window text
    Returns:
        the extracted articles, one dict per article
    """
    # Build the prompts (kept in Chinese to match the source statutes)
    system_prompt = """你是一个专业的法律文本分析助手。你的任务是从给定的文本中提取完整的法律条文。

提取规则:
1. 识别以"第X条"开头的法律条文
2. 提取条文的完整内容,包括所有款项
3. 如果条文不完整(被截断),标记为不完整
4. 严格按照原文提取,不要修改或总结

输出格式要求(JSON数组):
[
  {
    "article_number": "第一条",
    "title": "条文标题(如果有)",
    "content": "条文完整内容",
    "is_complete": true
  }
]

如果文本中没有法律条文,返回空数组 []"""

    user_prompt = f"""请从以下文本中提取所有法律条文:

---
{window_text}
---

请以JSON格式输出提取结果:"""

    # Build the chat messages
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": user_prompt}
    ]
    # Apply the chat template
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    # Encode the input
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    # Generate
    generated_ids = model.generate(
        **model_inputs,
        max_new_tokens=MAX_NEW_TOKENS,
        temperature=TEMPERATURE,
        do_sample=True,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode only the newly generated part
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
    # Parse the JSON
    articles = parse_json_response(response)
    return articles


def parse_json_response(response: str) -> list[dict]:
    """
    Parse the JSON in an LLM response.
    Args:
        response: the raw LLM output
    Returns:
        the parsed list of articles
    """
    # Try parsing the response directly
    try:
        return json.loads(response)
    except json.JSONDecodeError:
        pass
    # Try extracting a JSON array
    json_pattern = r'\[[\s\S]*?\]'
    matches = re.findall(json_pattern, response)
    for match in matches:
        try:
            result = json.loads(match)
            if isinstance(result, list):
                return result
        except json.JSONDecodeError:
            continue
    # Try extracting a ```json code block
    code_block_pattern = r'```(?:json)?\s*([\s\S]*?)\s*```'
    code_matches = re.findall(code_block_pattern, response)
    for match in code_matches:
        try:
            result = json.loads(match)
            if isinstance(result, list):
                return result
        except json.JSONDecodeError:
            continue
    print(f"Warning: could not parse the JSON response; raw output: {response[:200]}...")
    return []


# ==================== Deduplication and merging ====================

def merge_articles(all_articles: list[dict]) -> list[dict]:
    """
    Merge and deduplicate the extracted articles.
    Args:
        all_articles: every extracted article
    Returns:
        the deduplicated, merged list
    """
    # Group by article number
    article_map = {}
    for article in all_articles:
        article_num = article.get("article_number", "")
        if not article_num:
            continue
        # Normalize the article number (strip whitespace)
        article_num = article_num.strip()
        if article_num not in article_map:
            article_map[article_num] = article
        else:
            # Already seen: keep the more complete version
            existing = article_map[article_num]
            existing_content = existing.get("content", "")
            new_content = article.get("content", "")
            # Prefer the longer, complete version
            if len(new_content) > len(existing_content):
                article_map[article_num] = article
            elif article.get("is_complete", False) and not existing.get("is_complete", False):
                article_map[article_num] = article
    # Sort by article number
    def sort_key(item):
        article_num = item[0]
        # Sort by the Arabic digits in the number (numbers without digits sort as 0)
        numbers = re.findall(r'\d+', article_num)
        if numbers:
            return int(numbers[0])
        return 0
    sorted_articles = sorted(article_map.items(), key=sort_key)
    return [article for _, article in sorted_articles]


# ==================== Main pipeline ====================

def extract_law_articles(pdf_path: str, output_path: str,
                         model_path: str = MODEL_NAME,
                         window_size: int = WINDOW_SIZE,
                         overlap: int = OVERLAP_SIZE):
    """
    Main extraction routine.
    Args:
        pdf_path: path to the PDF file
        output_path: path of the output JSON file
        model_path: model path or name
        window_size: sliding-window size
        overlap: window overlap
    """
    print("=" * 50)
    print("Legal article extraction tool")
    print("=" * 50)
    # 1. Extract the PDF text
    print(f"\n[1/4] Reading PDF: {pdf_path}")
    full_text = extract_text_from_pdf(pdf_path)
    print(f"Extracted {len(full_text)} characters")
    # 2. Load the model
    print(f"\n[2/4] Loading model...")
    model, tokenizer = load_model(model_path)
    # 3. Sliding-window extraction
    print(f"\n[3/4] Extracting articles with the sliding window...")
    print(f"Window size: {window_size}, overlap: {overlap}")
    all_articles = []
    windows = list(sliding_window(full_text, window_size, overlap))
    for window_idx, window_text in tqdm(windows, desc="Windows"):
        try:
            articles = extract_articles_from_window(model, tokenizer, window_text)
            all_articles.extend(articles)
        except Exception as e:
            print(f"\nWindow {window_idx} failed: {e}")
            continue
    print(f"\nExtracted {len(all_articles)} raw records")
    # 4. Merge and deduplicate
    print(f"\n[4/4] Merging and deduplicating...")
    merged_articles = merge_articles(all_articles)
    print(f"{len(merged_articles)} articles remain after deduplication")
    # 5. Save the result
    output_data = {
        "source_file": os.path.basename(pdf_path),
        "total_articles": len(merged_articles),
        "extraction_config": {
            "model": model_path,
            "window_size": window_size,
            "overlap": overlap
        },
        "articles": merged_articles
    }
    # Make sure the output directory exists
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(output_data, f, ensure_ascii=False, indent=2)
    print(f"\nResult saved to: {output_path}")
    print("=" * 50)
    print("Extraction finished!")
    print("=" * 50)
    return merged_articles


# ==================== Command-line entry point ====================

def main():
    parser = argparse.ArgumentParser(
        description="Extract legal articles from a PDF document"
    )
    parser.add_argument(
        "--pdf", "-p",
        type=str,
        required=True,
        help="path to the input PDF file"
    )
    parser.add_argument(
        "--output", "-o",
        type=str,
        default=None,
        help="path of the output JSON file (defaults to the resources directory)"
    )
    parser.add_argument(
        "--model", "-m",
        type=str,
        default=MODEL_NAME,
        help=f"model name or path (default: {MODEL_NAME})"
    )
    parser.add_argument(
        "--window-size", "-w",
        type=int,
        default=WINDOW_SIZE,
        help=f"sliding-window size (default: {WINDOW_SIZE})"
    )
    parser.add_argument(
        "--overlap",
        type=int,
        default=OVERLAP_SIZE,
        help=f"window overlap (default: {OVERLAP_SIZE})"
    )
    args = parser.parse_args()
    # Default output path
    if args.output is None:
        script_dir = Path(__file__).parent
        resources_dir = script_dir / "resources"
        resources_dir.mkdir(exist_ok=True)
        pdf_name = Path(args.pdf).stem
        args.output = str(resources_dir / f"{pdf_name}_articles.json")
    # Run the extraction
    extract_law_articles(
        pdf_path=args.pdf,
        output_path=args.output,
        model_path=args.model,
        window_size=args.window_size,
        overlap=args.overlap
    )


if __name__ == "__main__":
    main()

develop/RAG/requirements.txt (+25, -0)

@@ -0,0 +1,25 @@
# ==================== Core dependencies ====================

# PDF parsing
PyMuPDF>=1.23.0

# Model inference (Transformers stack)
transformers>=4.37.0
torch>=2.0.0
accelerate>=0.25.0
bitsandbytes>=0.41.0 # 4-bit quantization support (needs a special install on Windows)

# Data processing
tqdm>=4.66.0

# ==================== Optional dependencies ====================

accelerate
mindnlp>=0.5.1
transformers
torch>=2.9.0
gradio==5.49.1
markdown
pyyaml
latex2mathml
openi

develop/RAG/verify_extraction.py (+290, -0)

@@ -0,0 +1,290 @@
"""
法律条文提取结果验证脚本

功能:
1. 内容匹配检查:每个条目的 content 是否能在原 PDF 中找到(目标 >= 98%)
2. 编号顺序检查:条文编号是否按顺序排列(失序率目标 <= 2%)

用法:
python verify_extraction.py -p <原PDF路径> -j <提取结果JSON路径>
"""

import os
import json
import re
import argparse
from pathlib import Path
from typing import List, Dict, Tuple

import fitz # PyMuPDF

# ==================== Chinese numerals to Arabic numerals ====================

CN_NUM = {
'零': 0, '一': 1, '二': 2, '三': 3, '四': 4,
'五': 5, '六': 6, '七': 7, '八': 8, '九': 9,
'十': 10, '百': 100, '千': 1000
}


def cn_to_arabic(cn_str: str) -> int:
    """
    Convert a Chinese numeral to an Arabic number.
    Example: "第一千二百六十条" -> 1260
    """
    # Keep only the numeral part
    cn_str = cn_str.replace('第', '').replace('条', '').strip()

    # Already Arabic digits
    if cn_str.isdigit():
        return int(cn_str)

    result = 0
    temp = 0

    for char in cn_str:
        if char in CN_NUM:
            num = CN_NUM[char]
            if num >= 10:
                if temp == 0:
                    temp = 1
                if num == 10:
                    result += temp * 10
                    temp = 0
                elif num == 100:
                    result += temp * 100
                    temp = 0
                elif num == 1000:
                    result += temp * 1000
                    temp = 0
            else:
                temp = num

    result += temp
    return result
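# Worked example for "第一千二百六十条": stripping "第" and "条" leaves
# "一千二百六十"; then 一(1)×千(1000) -> 1000, 二(2)×百(100) -> +200,
# 六(6)×十(10) -> +60, and no units digit remains, giving 1260.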


# ==================== PDF text extraction ====================

def extract_text_from_pdf(pdf_path: str) -> str:
    """Extract the full text of a PDF."""
    doc = fitz.open(pdf_path)
    text_parts = []
    for page in doc:
        text_parts.append(page.get_text())
    doc.close()
    return "\n".join(text_parts)


def normalize_text(text: str) -> str:
    """
    Normalize text for matching:
    - remove all whitespace (spaces, newlines, tabs, etc.)
    """
    # Remove all whitespace
    text = re.sub(r'\s+', '', text)
    return text


# ==================== Checks ====================

def flatten_articles(articles: List) -> List[Dict]:
    """Flatten a nested list of articles."""
    result = []
    for art in articles:
        if isinstance(art, dict):
            result.append(art)
        elif isinstance(art, list):
            # Flatten recursively
            result.extend(flatten_articles(art))
    return result


def check_content_match(pdf_text: str, articles: List[Dict]) -> Tuple[int, int, List[Dict]]:
    """
    Check the content match rate.

    Returns: (matched count, total count, list of unmatched entries)
    """
    # Flatten first
    articles = flatten_articles(articles)

    # Normalize the PDF text
    normalized_pdf = normalize_text(pdf_text)

    matched = 0
    unmatched = []

    for art in articles:
        if not isinstance(art, dict):
            continue
        content = art.get('content', '')
        if not content:
            continue

        # Normalize the article content
        normalized_content = normalize_text(content)

        # Check whether it occurs in the PDF. PDF extraction can introduce
        # small differences, so match on a prefix of the content (the first
        # 50 characters) rather than requiring an exact full match
        search_str = normalized_content[:min(50, len(normalized_content))]

        if search_str in normalized_pdf:
            matched += 1
        else:
            # Retry with a looser match (first 30 characters)
            search_str_short = normalized_content[:min(30, len(normalized_content))]
            if search_str_short in normalized_pdf:
                matched += 1
            else:
                unmatched.append({
                    'article_number': art.get('article_number', 'unknown'),
                    'content_preview': content[:100] + '...' if len(content) > 100 else content
                })

    return matched, len(articles), unmatched


def check_number_order(articles: List[Dict]) -> Tuple[int, int, List[Dict]]:
    """
    Check the article-number ordering.

    Returns: (in-order count, total count, list of out-of-order entries)
    """
    # Flatten first
    articles = flatten_articles(articles)

    if not articles:
        return 0, 0, []

    ordered = 0
    disordered = []
    prev_num = 0

    for i, art in enumerate(articles):
        article_number = art.get('article_number', '')

        try:
            current_num = cn_to_arabic(article_number)
        except Exception:
            # Unparseable number
            disordered.append({
                'index': i,
                'article_number': article_number,
                'reason': 'unparseable article number'
            })
            continue

        if current_num > prev_num:
            ordered += 1
        elif current_num == prev_num:
            # Duplicate number (possibly split paragraphs); count as in order for now
            ordered += 1
        else:
            disordered.append({
                'index': i,
                'article_number': article_number,
                'prev_number': prev_num,
                'current_number': current_num,
                'reason': f'out of order: previous entry was Article {prev_num}, current is Article {current_num}'
            })

        prev_num = current_num

    return ordered, len(articles), disordered


# ==================== Main ====================

def main():
    parser = argparse.ArgumentParser(description="Validate legal-article extraction results")
    parser.add_argument("--pdf", "-p", required=True, help="Path to the source PDF file")
    parser.add_argument("--json", "-j", required=True, help="Path to the extraction result JSON")
    parser.add_argument("--verbose", "-v", action="store_true", help="Show details")

    args = parser.parse_args()

    # Make sure both files exist
    if not os.path.exists(args.pdf):
        print(f"Error: PDF file not found: {args.pdf}")
        return
    if not os.path.exists(args.json):
        print(f"Error: JSON file not found: {args.json}")
        return

    print("=" * 60)
    print("Legal-article extraction validation")
    print("=" * 60)

    # 1. Load the data
    print(f"\n[1] Loading data...")
    print(f"    PDF: {args.pdf}")
    print(f"    JSON: {args.json}")

    pdf_text = extract_text_from_pdf(args.pdf)
    with open(args.json, 'r', encoding='utf-8') as f:
        data = json.load(f)

    articles = data.get('articles', [])
    print(f"    Total articles: {len(articles)}")

    # 2. Content match check
    print(f"\n[2] Checking content match...")
    matched, total, unmatched = check_content_match(pdf_text, articles)
    match_rate = matched / total * 100 if total > 0 else 0

    print(f"    Matched: {matched}/{total}")
    print(f"    Match rate: {match_rate:.2f}%")

    if match_rate >= 98:
        print(f"    ✅ Passed (>= 98%)")
    else:
        print(f"    ❌ Failed (< 98%)")

    if args.verbose and unmatched:
        print(f"\n    Unmatched entries:")
        for item in unmatched[:10]:  # show at most 10
            print(f"    - {item['article_number']}: {item['content_preview'][:50]}...")
        if len(unmatched) > 10:
            print(f"    ... {len(unmatched) - 10} more unmatched")

    # 3. Number ordering check
    print(f"\n[3] Checking number ordering...")
    ordered, total, disordered = check_number_order(articles)
    order_rate = ordered / total * 100 if total > 0 else 0
    disorder_rate = 100 - order_rate

    print(f"    In order: {ordered}/{total}")
    print(f"    Disorder rate: {disorder_rate:.2f}%")

    if disorder_rate <= 2:
        print(f"    ✅ Passed (<= 2%)")
    else:
        print(f"    ❌ Failed (> 2%)")

    if args.verbose and disordered:
        print(f"\n    Out-of-order entries:")
        for item in disordered[:10]:
            print(f"    - index {item['index']}: {item['article_number']} - {item['reason']}")
        if len(disordered) > 10:
            print(f"    ... {len(disordered) - 10} more out of order")

    # 4. Summary
    print("\n" + "=" * 60)
    print("Validation summary")
    print("=" * 60)
    print(f"    Content match rate: {match_rate:.2f}% {'✅' if match_rate >= 98 else '❌'}")
    print(f"    Number disorder rate: {disorder_rate:.2f}% {'✅' if disorder_rate <= 2 else '❌'}")

    if match_rate >= 98 and disorder_rate <= 2:
        print("\n🎉 All checks passed!")
    else:
        print("\n⚠️ Problems found; please review the extraction logic")

    print("=" * 60)


if __name__ == "__main__":
    main()

develop/README.md (+144, -0)

@@ -0,0 +1,144 @@
# ChatLaw Development Module

This directory hosts feature development for the ChatLaw project.

## Directory layout

```
develop/
├── README.md                         # this file
├── resources/                        # resource files
│   ├── Legal Documents/              # legal PDF files
│   ├── slidding window/              # sliding-window extraction output
│   └── output/                       # batch-processing output
└── RAG/                              # RAG (retrieval-augmented generation) module
    ├── requirements.txt              # Python dependencies
    ├── extract_regex_llm_repair.py   # article extraction (regex + LLM + window repair, recommended)
    ├── extract_law_pure_regex.py     # article extraction (pure regex, fastest)
    ├── extract_sliding_window.py     # article extraction (sliding window, legacy)
    ├── batch_extract_all_laws.py     # batch extraction script
    └── verify_extraction.py          # validation script for extraction results
```

---

## RAG module

### What it does

Extracts structured legal articles from PDF statutes and writes them out as JSON.

### Extraction approaches compared

| Approach | Script | Speed | Accuracy | Requirements |
|------|------|------|--------|------|
| **Regex + LLM + window repair (recommended)** | `extract_regex_llm_repair.py` | medium | high | GPU + model |
| Pure regex | `extract_law_pure_regex.py` | **fastest** | low | none |
| Sliding window + LLM | `extract_sliding_window.py` | slow | medium | GPU + model |

### extract_regex_llm_repair.py pipeline

```
PDF full text
  ↓ pass 1: regex split (on "第X条")
  ↓ LLM batch structuring (extracts article_number, content, title)
  ↓ detect incomplete entries (article_number empty or content too short)
  ↓ pass 2: large-window repair (re-extract with a 2000-character expanded context)
JSON output
```
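
The gate between the two passes is a simple heuristic; a minimal sketch of the idea, mirroring `is_incomplete` in `extract_regex_llm_repair.py`:

```python
import re

def needs_repair(article: dict) -> bool:
    """Send an entry to the repair pass when its number is malformed
    or its content looks truncated."""
    num = article.get("article_number", "")
    if not re.match(r'^第[一二三四五六七八九十百千零0-9]+条$', num):
        return True
    return len(article.get("content", "")) < 10
```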

### Quick start

#### Option 1: regex + LLM (recommended)

1. **Install the dependencies**
```bash
cd RAG
pip install -r requirements.txt
```

2. **Download the model** (about 15 GB)
```bash
# Download from ModelScope
modelscope download --model Qwen/Qwen2.5-7B-Instruct --local_dir ../resources/Qwen2.5-7B-Instruct
```

3. **Run the extraction**
```bash
python extract_regex_llm_repair.py -p "<path to your PDF>"
```

4. **Validate the result** (regex-based validation)
```bash
python verify_extraction.py -p "<source PDF path>" -j "<output JSON path>" -v
```

#### Option 2: pure regex (fastest, no model needed)

```bash
python extract_law_pure_regex.py -p "D:\Legal Documents\中华人民共和国民法典_20200528.pdf"
```

### Batch-process all law files

```bash
python batch_extract_all_laws.py
```

> Note: edit the path configuration in `batch_extract_all_laws.py` first

## Command-line arguments

### extract_regex_llm_repair.py

| Argument | Short | Description | Default |
|------|------|------|--------|
| `--pdf` | `-p` | input PDF path | required |
| `--output` | `-o` | output JSON path | `./resources/extracted_v2/<pdf name>_struct.json` |
| `--model` | `-m` | model path | `./resources/Qwen2.5-7B-Instruct` (relative path) |
| `--batch` | `-b` | batch size | 8 |

### verify_extraction.py

| Argument | Short | Description |
|------|------|------|
| `--pdf` | `-p` | source PDF path |
| `--json` | `-j` | extraction result JSON path |
| `--verbose` | `-v` | show details |

## Output format example

```json
{
"source": "中华人民共和国民法典_20200528.pdf",
"total": 1260,
"articles": [
{
"article_number": "第一条",
"content": "为了保护民事主体的合法权益,调整民事关系,维护社会和经济秩序,适应中国特色社会主义发展要求,弘扬社会主义核心价值观,根据宪法,制定本法。",
"title": ""
}
]
}
```
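
The output can be consumed directly downstream; a minimal loading sketch (the file name is hypothetical):

```python
import json

# Load one extraction result (hypothetical file name)
with open("resources/extracted_v2/中华人民共和国民法典_20200528_struct.json",
          encoding="utf-8") as f:
    data = json.load(f)

print(data["total"])  # number of extracted articles
for art in data["articles"][:3]:  # preview the first three
    print(art["article_number"], art["content"][:40])
```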

## Core dependencies

- Python 3.10+
- PyMuPDF >= 1.23.0 (PDF parsing)
- transformers >= 4.37.0 (model inference)
- torch >= 2.0.0 (deep-learning framework)
- bitsandbytes >= 0.41.0 (4-bit quantization, optional)
- tqdm >= 4.66.0 (progress bars)

## Development notes

1. Use Chinese comments in all code and documentation
2. Put downloaded models and generated files under the `resources/` directory
3. `resources/` is listed in `.gitignore` and will not be pushed to the remote
4. Submit a PR only after the changes pass testing

requirements.txt (+1, -2)

@@ -2,9 +2,8 @@ accelerate
mindnlp>=0.5.1
transformers
torch>=2.9.0
gradio
gradio==5.49.1
markdown
pyyaml
latex2mathml
markdown
openi

scripts/orangepi/CANN_installer.sh → scripts/orangepi/310b/CANN_installer.sh

@@ -27,7 +27,7 @@ cd /home/HwHiAiUser/Downloads || {
cd /home/HwHiAiUser/Downloads || exit

openi model download enter/nodule_segmentation cann-toolkit --save_path .
openi model download enter/nodule_segmentation cann-kernels --save_path .
openi model download enter/nodule_segmentation cann-kernels_310b --save_path .

export TOOLKIT_NAME=$(python -c "import os;import fnmatch;prefix_toolkit='Ascend-cann-toolkit'; extension='.run';found_toolkit = any(f.startswith(prefix_toolkit) and f.endswith(extension) for f in os.listdir('.')); toolkit_path = next((f for f in os.listdir(os.getcwd()) if fnmatch.fnmatch(f, 'Ascend-cann-toolkit*')), None); print(toolkit_path)")
export KERNELS_NAME=$(python -c "import os;import fnmatch;prefix_kernels='Ascend-cann-kernels';extension='.run';found_kernels = any(f.startswith(prefix_kernels) and f.endswith(extension) for f in os.listdir('.'));kernels_path = next((f for f in os.listdir(os.getcwd()) if fnmatch.fnmatch(f, 'Ascend-cann-kernels*')), None);print( kernels_path);")

scripts/orangepi/310b/aclinit.json (+1, -0)

@@ -0,0 +1 @@
{"err_msg_mode":"1"}

scripts/orangepi/mindspore_installer.sh → scripts/orangepi/310b/mindspore_installer.sh

@@ -7,7 +7,17 @@ pip install sympy
pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/te-*-py3-none-any.whl
pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/hccl-*-py3-none-any.whl

export MS_VERSION=2.5.0
export MS_VERSION=2.7.1
# Run this script as a regular user
# It installs MindSpore and sets up the environment variables
pip config set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple
python -m pip install -U pip

pip install sympy
pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/te-*-py3-none-any.whl
pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/hccl-*-py3-none-any.whl

export MS_VERSION=2.7.1
pip install \
https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_aarch64.whl \
--trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com \
@@ -15,6 +25,20 @@ https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unif
pip install jinja2 absl-py


{
echo "export GLOG_v=2";
echo "LOCAL_ASCEND=/usr/local/Ascend";
echo "source \${LOCAL_ASCEND}/ascend-toolkit/set_env.sh" ;
} >> ~/.bashrc

export GLOG_v=2
LOCAL_ASCEND=/usr/local/Ascend
source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh
python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()"

pip install jinja2 absl-py


{
echo "export GLOG_v=2";
echo "LOCAL_ASCEND=/usr/local/Ascend";

scripts/orangepi/preparation.sh → scripts/orangepi/310b/preparation.sh

@@ -11,7 +11,9 @@ FILE="/swapfile"
# Check whether the file exists
if [ ! -e "$FILE" ]; then
echo "swapfile does not exist; creating the swap partition"
echo "swapfile does not exist; running the setup commands..."

sudo fallocate -l 16G /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile

scripts/orangepi/310p/CANN_installer.sh (+42, -0)

@@ -0,0 +1,42 @@
# This script must be run as root, otherwise CANN cannot be installed
# It does the following:
# installs openi with pip, then downloads CANN through openi

# Check that the script is running as root
if [ "$(id -u)" -ne 0 ]; then
echo "This script must not be run in user mode."
exit 1
fi
pip config set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple || {
echo "Do not run this script with plain 'sudo bash'; run it with 'sudo bash -i'"
}
pip install openi
set -e # abort the script immediately if any command fails
cd /usr/local/Ascend/ascend-toolkit || {
echo "Error: Failed to change directory to /usr/local/Ascend/ascend-toolkit. No files will be deleted." >&2
exit 1
}

# shellcheck disable=SC2035
rm -rf * # runs only after the cd above succeeded; if the cd failed we never reach this line, so files are not deleted by mistake
cd /home/HwHiAiUser/Downloads || {
echo "Error: Failed to change directory to /home/HwHiAiUser/Downloads." >&2
exit 1
}

cd /home/HwHiAiUser/Downloads || exit

openi model download enter/nodule_segmentation cann-toolkit --save_path .
openi model download enter/nodule_segmentation cann-kernels_310p --save_path .

export TOOLKIT_NAME=$(python -c "import os;import fnmatch;prefix_toolkit='Ascend-cann-toolkit'; extension='.run';found_toolkit = any(f.startswith(prefix_toolkit) and f.endswith(extension) for f in os.listdir('.')); toolkit_path = next((f for f in os.listdir(os.getcwd()) if fnmatch.fnmatch(f, 'Ascend-cann-toolkit*')), None); print(toolkit_path)")
export KERNELS_NAME=$(python -c "import os;import fnmatch;prefix_kernels='Ascend-cann-kernels';extension='.run';found_kernels = any(f.startswith(prefix_kernels) and f.endswith(extension) for f in os.listdir('.'));kernels_path = next((f for f in os.listdir(os.getcwd()) if fnmatch.fnmatch(f, 'Ascend-cann-kernels*')), None);print( kernels_path);")

chmod +x ./${TOOLKIT_NAME}
chmod +x ./${KERNELS_NAME}
./${TOOLKIT_NAME} --install --quiet
if mkdir -p /usr/local/Ascend/ascend-toolkit/latest; then
./${KERNELS_NAME} --install --quiet
else
echo "请确认cann已经安装成功"
fi

scripts/orangepi/310p/mindspore_installer.sh (+51, -0)

@@ -0,0 +1,51 @@
# Run this script as a regular user
# It installs MindSpore and sets up the environment variables
pip config set global.index-url https://mirrors.huaweicloud.com/repository/pypi/simple
python -m pip install -U pip

pip install sympy
pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/te-*-py3-none-any.whl
pip install /usr/local/Ascend/ascend-toolkit/latest/lib64/hccl-*-py3-none-any.whl

export MS_VERSION=2.7.1
pip install \
https://ms-release.obs.cn-north-4.myhuaweicloud.com/${MS_VERSION}/MindSpore/unified/aarch64/mindspore-${MS_VERSION/-/}-cp311-cp311-linux_aarch64.whl \
--trusted-host ms-release.obs.cn-north-4.myhuaweicloud.com \
-i https://pypi.tuna.tsinghua.edu.cn/simple
pip install jinja2 absl-py


{
echo "export GLOG_v=2";
echo "LOCAL_ASCEND=/usr/local/Ascend";
echo "source \${LOCAL_ASCEND}/ascend-toolkit/set_env.sh" ;
} >> ~/.bashrc

export GLOG_v=2
LOCAL_ASCEND=/usr/local/Ascend
source ${LOCAL_ASCEND}/ascend-toolkit/set_env.sh
python -c "import mindspore;mindspore.set_context(device_target='Ascend');mindspore.run_check()"

scripts/orangepi/310p/preparation.sh (+46, -0)

@@ -0,0 +1,46 @@
#!/bin/bash
# Run this script as a regular user
# It does the following:
# 1. Configures 16 GB of swap automatically (skipped if a file named swapfile is already configured)
# 2. Creates a conda environment named mindspore and makes new shells activate it automatically
# 3. Sets the number of control CPUs to 4 and the number of AI CPUs to 0
# 4. Configures the static IP 192.168.137.100 with netmask 255.255.255.0
# Path of the file to check
FILE="/swapfile"
# Check whether the file exists
if [ ! -e "$FILE" ]; then
echo "swapfile does not exist; running the setup commands..."

sudo fallocate -l 16G /swapfile
sudo mkswap /swapfile
sudo swapon /swapfile
sudo chmod 600 /swapfile
echo '/swapfile none swap sw 0 0' | sudo tee -a /etc/fstab
echo "swapfile 已创建并配置为swap分区。"
else
echo "swapfile 已存在。"
fi

if ! conda env list | grep -q mindspore; then
# Create a conda environment named mindspore and auto-activate it when a new shell starts
conda create -n mindspore python=3.11 -y
echo "conda activate mindspore" >> ~/.bashrc
echo "mindspore environment created."
else
echo "mindspore environment already exists."
fi

# Set the number of control CPUs to 4
sudo npu-smi set -t cpu-num-cfg -i 0 -c 0 -v 0:4:0

# Configure the static IP
sudo nmcli con mod "Wired connection 1" \
ipv4.addresses "192.168.137.100/24" \
ipv4.gateway "192.168.137.1" \
ipv4.dns "8.8.8.8" ipv4.method "manual"

sudo reboot

scripts/orangepi/README.md (+53, -0)

@@ -0,0 +1,53 @@
# Orange Pi MindSpore environment setup scripts

This directory provides three shell scripts per chip model that automatically set up the Ascend environment for MindSpore 2.7.1 on the Orange Pi AIpro.

#### Usage
**1. What the scripts do**
1. preparation.sh prepares the Orange Pi for the environment setup, in these steps:
- configures 16 GB of swap; skipped if it is already configured
- creates a conda environment named mindspore and makes new shells activate it automatically
- sets the number of control CPUs to 4 and the number of AI CPUs to 0
- configures the static IP 192.168.137.100 with netmask 255.255.255.0
- reboots the board immediately after the steps above so the settings take effect
2. CANN_installer.sh downloads and installs the CANN 8.1RC1 toolkit and kernels packages, in these steps:
- removes the currently installed toolkit package
- downloads the toolkit and kernels packages into /home/HwHiAiUser/Downloads
- installs the toolkit package; there is no need to accept the license manually, just wait for the installation to finish
- installs the kernels package; again, no manual license confirmation is needed, just wait for it to finish
3. mindspore_installer.sh installs MindSpore, sets up the environment variables and runs run_check(), in these steps:
- installs the Ascend AI developer toolkit
- installs the MindSpore prerequisites
- installs MindSpore
- sets up the environment variables
- runs run_check()

**2. How to run the scripts**

Run the scripts in the order and with the exact commands below, all from a regular user session; using the wrong one of the user and root accounts will make the installation fail


```
bash preparation.sh # the script reboots the system automatically when it finishes; this is expected
```

```
sudo bash -i CANN_installer.sh # note: this script must be run from the user account with sudo bash -i
```

```
bash mindspore_installer.sh
```

_All three scripts contain commands that need network access, so keep the network connected for the whole run_

**3. Other notes**

The static-IP step exists so that the board can be reached over ssh from Windows through a wired Ethernet connection. The Windows-side setup and the passwordless-login configuration cannot be done by the scripts and must be configured by hand. This article can be used as a reference:

[A supplement to the official Orange Pi MindSpore environment setup guide - CSDN App]
https://blog.csdn.net/weixin_74531285/article/details/143940560?sharetype=blog&shareId=143940560&sharerefer=APP&sharesource=weixin_74531285&sharefrom=link

Only the Windows-side setup and the part about writing the Windows public key into the board's .ssh directory are needed from it; the scripts cover everything on the board side.
