# 步骤 1: 读取和清洗文本
# 目标：从文本文件中读取内容，进行清洗并分段。
import re
with open('二进桂师.txt', 'r', encoding='utf-8') as file:
    text = file.read()


# Step 1: 文本清洗与分段
def clean_and_split_text(text):
    # 1. 清除所有空白字符（包括空格、制表符、换行符等）
    cleaned_text = re.sub(r'\s+', '', text)
    
    # 2. 清除重复的标点符号
    cleaned_text = re.sub(r'[。！？]+', '。', cleaned_text)
    
    # 3. 移除特殊字符和无效值（根据需要可以添加更多）
    cleaned_text = re.sub(r'[①②③④⑤⑥⑦⑧⑨⑩""''（）\(\)\[\]\{\}]', '', cleaned_text)
    
    # 4. 分割句子
    sentences = re.split(r'。', cleaned_text)
    
    # 5. 过滤空字符串和只包含标点的句子
    filtered_sentences = []
    for sentence in sentences:
        sentence = sentence.strip()
        if sentence and not re.match(r'^[，、：；]+$', sentence):
            # 6. 去除重复的句子
            if sentence not in filtered_sentences:
                filtered_sentences.append(sentence)
    
    return filtered_sentences    

# 获取分段的句子
sentences = clean_and_split_text(text)

# 输出前几段以检查结果
print("分段后的文本:")
print(sentences[:5])

# 保存清洗后的文本
with open('clean.txt', 'w', encoding='utf-8') as file:
    # 将清洗后的句子重新组合，使用句号连接
    cleaned_text = '。'.join(sentences) + '。'  # 添加最后的句号
    file.write(cleaned_text)

分段后的文本:
['二进桂师唐肇华我曾两次进桂师工作，避风', '第一次是在1940年2月，我被迫离开灵川国民中学，到桂师任师训班导师和二班物理教师，7月回广西大学复学；第二次是在1941年9月，我被武装接长西大的高阳勒令离校，回桂师任三、四合班导师和师五班物理课，翌年7月离校，两次都是唐现之校长得知我的困境后，叫我到桂师工作的', '1938年11月我在桂林接中共广西省工委指示参加广西学生军，翌年一月受命打入三青团干训班受训，以争夺该团部分领导权', '2月，干训班结束，我便前往全州县筹建三青团分团', '省工委指示，全州县是广西的北大门，地理位置很重要，是蒋介石南下的要冲，驻有他的嫡系部队新五军军长杜聿明军部和该部主力陆军二百师，要利用三青团这个公开组织为合法阵地，按上级党的指示宣传党的抗日民族统一战线的方针政策，团结广大青年群众，开展抗日救亡活动，以适应革命形势发展的需要，并在可能条件下把影响伸进军营']

<>:17: SyntaxWarning: invalid escape sequence '\('
<>:17: SyntaxWarning: invalid escape sequence '\('
C:\Users\Administrator\AppData\Local\Temp\ipykernel_16088\286409681.py:17: SyntaxWarning: invalid escape sequence '\('
  cleaned_text = re.sub(r'[①②③④⑤⑥⑦⑧⑨⑩""''（）\(\)\[\]\{\}]', '', cleaned_text)

# 导入必要的库
import jieba
from collections import Counter
import pandas as pd

# 加载文本数据
file_path = "二进桂师.txt"  
with open(file_path, "r", encoding="utf-8") as file:
    text = file.read()

# 导入哈工大停用词
with open("hit_stopwords.txt", "r", encoding="utf-8") as file:
    stopwords = set(file.read().splitlines())

# 文本分词并过滤停用词
words = [word for word in jieba.cut(text) if word.strip() and word not in stopwords]

# 统计词频
word_freq = Counter(words)

# 将词频结果转为 DataFrame
df_word_freq = pd.DataFrame(word_freq.items(), columns=["词语", "频率"]).sort_values(by="频率", ascending=False)

# 筛选高频词（频率 > 5）作为候选领域词汇
high_freq_words = df_word_freq[df_word_freq["频率"] > 5]["词语"].tolist()

# 人工筛选后领域词汇（这里可结合实际情况调整）
# 示例：领域词汇
custom_terms = high_freq_words  # 假设全选高频词，实际可手动调整

# 保存词频统计结果
df_word_freq.to_csv("词频统计结果.csv", index=False, encoding="utf-8")

# 保存自定义词库
with open("自定义词库.txt", "w", encoding="utf-8") as f:
    for term in custom_terms:
        f.write(f"{term}\n")

print("词频统计已保存为 '词频统计结果.csv'")
print("自定义词库已保存为 '自定义词库.txt'")

Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\Administrator\AppData\Local\Temp\jieba.cache
Loading model cost 0.390 seconds.
Prefix dict has been built successfully.

词频统计已保存为 '词频统计结果.csv'
自定义词库已保存为 '自定义词库.txt'

##  加入jieba分词
#  导入jieba
import jieba

# 加载自定义词库和哈工大停用词
jieba.load_userdict("自定义词库.txt")

def load_stopwords(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return set(line.strip() for line in f)

stopwords = load_stopwords("hit_stopwords.txt")

# 分词函数，支持去停用词
def segment_sentences(sentences, stopwords=None):
    segmented_sentences = []
    for sentence in sentences:
        words = jieba.lcut(sentence)  # 使用精确模式分词
        if stopwords:
            words = [word for word in words if word not in stopwords]
        segmented_sentences.append(" ".join(words))  # 拼接为分词后的字符串
    return segmented_sentences

# 加载文本数据并分割成句子
text_file = "二进桂师.txt"
with open(text_file, "r", encoding="utf-8") as file:
    text = file.read()

# 按句号、换行符分割文本为句子
sentences = [line.strip() for line in text.split('。') if line.strip()]

# 分词并去停用词
segmented_sentences = segment_sentences(sentences, stopwords=stopwords)

# 输出分词结果示例
print("分词后的句子示例:")
for i, sentence in enumerate(segmented_sentences[:5], 1):
    print(f"{i}: {sentence}")

# 保存分词后的文本
output_file = "jieba_result.txt"
with open(output_file, "w", encoding="utf-8") as file:
    # 将清洗后的句子重新组合，使用句号连接
    jieba_text = '。'.join(segmented_sentences) + '。'  # 添加最后的句号
    file.write(jieba_text)

print(f"分词结果已保存到文件 '{output_file}'")

分词后的句子示例:
1: 二   进   桂   师 
 唐   肇   华 
 曾 两次 进桂师 工作 避风
2: 第一次 1940 年 2 月 被迫 离开 灵川 国民中学 桂师 师训 班 导师 二班 物理 教师 7 月 回 广西大学 复学   第二次 1941 年 9 月 武装 接长 西大 高阳 勒令 离校 回 桂师 任三 四合 班 导师 师 五班 物理课 翌年 7 月 离校 两次 都 唐现 校长 得知 困境 后 桂师 工作
3: 1938 年 11 月 桂林 接 中共 广西省 工委 指示 参加 广   西   学生 军 翌年 一月 受命 打入 三青团 干训班 受训 争夺 该团 部分 领导权
4: 2 月 干训班 结束 便 前往 全州县 筹建 三青团 分团
5: 省 工委 指示 全州县 广西 北大 门 地理位置 很 重要 蒋介石 南下 要冲 驻有 嫡系 部队 新五军 军长 杜聿明   军部 该部 主力 陆军 二百 师 利用 三青团 公开 组织 合法 阵地 上级 党 指示 宣传 党 抗日民族统一战线 方针政策 团结 广大青年 群众 开展 抗日救亡 活动 适应 革命 形势 发展 需要 可能 条件 下 影响 伸进 军营
分词结果已保存到文件 'jieba_result.txt'

# 步骤 2: 命名实体识别（NER）
## 结合上述的 jieba 分词
# 导入必要的库
import os
import jieba
import hanlp
import pandas as pd
import datetime
from tqdm import tqdm
import re

# 加载 HanLP 预训练模型
HanLP = hanlp.load(hanlp.pretrained.mtl.CLOSE_TOK_POS_NER_SRL_DEP_SDP_CON_ELECTRA_BASE_ZH)

# 定义命名实体存储字典
entities = {
    "PERSON": [],  # 人名
    "GPE": [],     # 地名
    "ORG": [],     # 组织机构
    "DATE": [],    # 日期
    "EVENT":[]     # 事件
}

# 实体类型映射
entity_type_mapping = {
    "NR": "PERSON",  # 人名
    "NS": "GPE",     # 地名
    "NT": "ORG",     # 组织机构
    "TIME": "DATE",   # 时间
    "EVENT": "EVENT"    
}

# 日期补充识别正则表达式
date_patterns = [
    r'\d{4}年\d{1,2}月\d{1,2}日',  # 1949年10月1日
    r'\d{4}年\d{1,2}月',          # 1949年10月
    r'\d{4}年',                  # 1949年
    r'\d{1,2}月\d{1,2}日'         # 10月1日
]

# 加载自定义词库和停用词
jieba.load_userdict("自定义词库_modify.txt")  # 使用修改之后的自定义词库

def load_stopwords(file_path):
    with open(file_path, "r", encoding="utf-8") as f:
        return set(line.strip() for line in f)

stopwords = load_stopwords("hit_stopwords.txt")

# 分词预处理函数
def preprocess_sentences(sentences):
    preprocessed_sentences = []
    for sentence in sentences:
        words = [word for word in jieba.lcut(sentence) if word.strip() and word not in stopwords]
        preprocessed_sentences.append(" ".join(words))  # 拼接成分词后的句子
    return preprocessed_sentences

# 实体提取函数
def extract_entities(sentences):
    for sentence in tqdm(sentences, desc="正在进行命名实体识别"):
        # 调用 HanLP 模型进行 NER
        doc = HanLP(sentence)
        for ent in doc['ner/msra']:
            entity_text = ent[0]
            entity_type = entity_type_mapping.get(ent[1], ent[1])
            if entity_type in entities:
                entities[entity_type].append(entity_text)
        
        # 正则表达式补充日期提取
        for pattern in date_patterns:
            matches = re.findall(pattern, sentence)
            entities["DATE"].extend(matches)


# 处理文本数据
text_file = "二进桂师.txt"
with open(text_file, "r", encoding="utf-8") as file:
    text = file.read()

# 按句号和换行符分割文本
sentences = [line.strip() for line in text.split('。') if line.strip()]

# 预处理：分词并过滤停用词
preprocessed_sentences = preprocess_sentences(sentences)

# 提取实体
extract_entities(preprocessed_sentences)

# 实体去重
for key in entities:
    entities[key] = list(set(entities[key]))

# 输出提取结果
print("\n提取的命名实体:")
for key, value in entities.items():
    print(f"{key}: {value}")

# 创建 DataFrame
df_entities = pd.DataFrame()
for entity_type, values in entities.items():
    chinese_type = {
        "PERSON": "人名",
        "GPE": "地名",
        "ORG": "组织机构",
        "DATE": "日期"
    }.get(entity_type, entity_type)
    df_temp = pd.DataFrame({
        '实体类型': [chinese_type] * len(values),
        '实体内容': values
    })
    df_entities = pd.concat([df_entities, df_temp])

# 重置索引
df_entities.reset_index(drop=True, inplace=True)

# 保存结果到 CSV 文件
timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
output_file = f"entities_{timestamp}.csv"
try:
    df_entities.to_csv(output_file, index=False, encoding='utf-8-sig')
    print(f"结果已保存至: {output_file}")
except Exception as e:
    print(f"保存文件失败: {e}")

# 输出统计信息
print("\n实体统计信息:")
print(df_entities['实体类型'].value_counts())

正在进行命名实体识别: 100%|██████████| 64/64 [00:05<00:00, 11.58it/s]

提取的命名实体:
PERSON: ['马君武', '陈岸', '于辉坤', '马坤元', '欧苇', '桂师', '唐 肇 华', '杜聿明', '雷', '唐', '康', '周可传', '张丽 贞', '陈赐珍', '李宗仁', '李', '赵', '高', '李四光', '唐现之', '雷沛 鸿', '王祥彻', '黄 立志', '石文忠', '赵建勋', '蒋介石', '梁漱溟', '钱念文', '梁', '汤', '高阳', '张丽贞', '汤松年', '汤 观感']
GPE: []
ORG: []
DATE: ['一九八九年', '翌年', '8 月', '中旬', '7 月', '1941 年', '一月', '6 月', '11 月', '10 月', '31', '晚上', '1942', '2 月', '九月', '1938 年', '9 月', '1941', '1940 年', '七 九', '六月', '暑假']
EVENT: []
结果已保存至: entities_20241214_215453.csv

实体统计信息:
实体类型
人名    34
日期    22
Name: count, dtype: int64

## 将jsonl文件华为BIO格式 
import json

def jsonl_to_bio(input_file, output_file):
    with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', encoding='utf-8') as outfile:
        for line in infile:
            # 解析每一行的 JSON 数据
            data = json.loads(line.strip())
            text = data['text']
            entities = data['entities']
            
            # 初始化 BIO 标签
            bio_tags = ['O'] * len(text)
            
            # 标注实体
            for entity in entities:
                start = entity['start_offset']
                end = entity['end_offset']
                label = entity['label']
                
                # 标注 BIO 格式
                bio_tags[start] = f"B-{label}"  # 实体起始为 B-<LABEL>
                for i in range(start + 1, end):
                    bio_tags[i] = f"I-{label}"  # 实体中间部分为 I-<LABEL>
            
            # 将字符和标签逐行写入文件
            for char, tag in zip(text, bio_tags):
                if char.strip():  # 跳过空白字符
                    outfile.write(f"{char} {tag}\n")
            outfile.write("\n")  # 每段文本间隔空行

# 转换文件
input_file = '1222_annotate.jsonl'
output_file = '1222_bio_output.txt'
jsonl_to_bio(input_file, output_file)

## 从 BIO 数据中提取实体及其标签，组织成三元组 (subject, predicate, object) 格式。

import time

def bio_to_entities_relations(bio_file, output_file_entities, output_file_relations):
    entities = []
    relations = []
    current_entity = {"type": None, "start": None, "end": None, "text": ""}
    with open(bio_file, "r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if not line:  # 遇到空行，处理当前实体并清空
                if current_entity["type"]:
                    entities.append(current_entity)
                    current_entity = {"type": None, "start": None, "end": None, "text": ""}
                continue

            word, label = line.split()
            if label.startswith("B-"):
                # 保存上一个实体
                if current_entity["type"]:
                    entities.append(current_entity)
                # 开始新实体
                current_entity = {"type": label[2:], "start": len(entities), "end": len(entities) + 1, "text": word}
            elif label.startswith("I-") and current_entity["type"] == label[2:]:
                current_entity["text"] += word
                current_entity["end"] += 1
            else:
                # 遇到非实体标记，保存当前实体
                if current_entity["type"]:
                    entities.append(current_entity)
                    current_entity = {"type": None, "start": None, "end": None, "text": ""}

    # 将实体和关系保存到文件
    with open(output_file_entities + "_" + str(int(time.time())) + ".txt", "w", encoding="utf-8") as file_entities:
        for entity in entities:
            file_entities.write(f"{entity['type']}: {entity['text']}\n")
    with open(output_file_relations + "_" + str(int(time.time())) + ".txt", "w", encoding="utf-8") as file_relations:
        for relation in relations:
            file_relations.write(f"{relation}\n")

# 示例使用
bio_file = "1222_bio_output.txt"
output_file_entities = "entities.txt"
output_file_relations = "relations.txt"
bio_to_entities_relations(bio_file, output_file_entities, output_file_relations)
print("实体已保存到", output_file_entities)
print("关系已保存到", output_file_relations)

实体已保存到 entities.txt
关系已保存到 relations.txt

# 本体框架示例
# 使用简单的 JSON 结构描述本体：

{
  "entities": {
    "person": {"attributes": ["name", "role"]},
    "time": {"attributes": ["date", "context"]},
    "location": {"attributes": ["name", "region"]},
    "org": {"attributes": ["name", "type"]},
    "thing": {"attributes": ["description", "time"]}
  },
  "relations": [
    {"name": "参与", "domain": "person", "range": "thing"},
    {"name": "指挥", "domain": "person", "range": "thing"},
    {"name": "发生于", "domain": "thing", "range": "time"},
    {"name": "发生在", "domain": "thing", "range": "location"},
    {"name": "位于", "domain": "org", "range": "location"}
  ]
}

定义的本体： {'concepts': ['person', 'event', 'time', 'location'], 'relations': ['participated_in', 'happened_in', 'occurred_on'], 'attributes': {'person': ['name', 'role'], 'event': ['name', 'description'], 'time': ['date'], 'location': ['name', 'region']}}

# 查询示例
for relation in relations:    
    print(f"{relation['subject']} {relation['predicate']} {relation['object']}")

唐肇华 参与 到桂师工作
到桂师工作 发生于 1940年2月
到桂师工作 发生在 灵川

自定义停用词¶

jieba分词¶

调整自定义词库¶

实体类型分析后的结果¶