6.支持Linux、Mac系统
[entities] person act like hate [relations] Act-Person Arg1:<ENTITY>, Arg2:<ENTITY> [events] [attributes]这里我只需要对实体和关系进行标注,比如我需要标注实体:人物,动作(包含喜欢/讨厌);关系:人物和动作的关系 当我们在网页server端打开/mydata/1.txt后,可以如下图开始标注了
[labels] person | 人名 like | 喜欢 hate | 讨厌 [drawing] SPAN_DEFAULT fgColor:black, bgColor:lightgreen, borderColor:darken ARC_DEFAULT color:black, arrowHead:triangle-5 person bgColor:yellow, borderColor:red like bgColor:green, borderColor:red hate bgColor:gray, borderColor:red接下来,就会在标注的页面看到如下所示了。
import os import sys def read_file(file_path): with open(file_path, 'r',encoding='utf8') as file: lines = file.readlines() return lines def write_file(file_path, lines): with open(file_path, 'w',encoding='utf8') as file: for line in lines: file.write(line) def check_one_file(conll_path, ann_path): # check process: 针对conll文件进行检查B开头的字符是否通过brat转化后是正确的 ann_lines = read_file(ann_path) conll_lines = read_file(conll_path) begin_words, begin_index = [], [] for ann_i in ann_lines: class_i, start_i, end_i = ann_i.split('\t')[1].split(' ') begin_words.append('B-' + class_i) begin_index.append(start_i) ann_zip = dict(zip(begin_index, begin_words)) # check new_conll = [] for conll_i in conll_lines: if conll_i == '\n': new_conll.append(conll_i) else: tag_i, start_i, end_i, word_i = conll_i.split('\t') if start_i in begin_index: if tag_i != ann_zip[start_i]: new_conll.append(ann_zip[start_i] + '\t' + start_i + '\t' + end_i + '\t' + word_i) else: new_conll.append(conll_i) else: new_conll.append(conll_i) return new_conll if __name__ == '__main__': sub_dir = sys.argv[1] # sub_dir = '../data/fd' # 生成一个保存新conll文件的文件夹 new_dir = os.path.join(sub_dir,'conll') if not os.path.exists(new_dir): os.mkdir(new_dir) path_pre = [] for path_i in os.listdir(sub_dir): if path_i.endswith('.ann'): path_pre.append(path_i.split('.ann')[0]) for path_pre_i in path_pre: ann_path_i = os.path.join(sub_dir,f'{path_pre_i}.ann') conll_path_i = os.path.join(sub_dir,f'{path_pre_i}.conll') new_conll_i = check_one_file(conll_path_i,ann_path_i) new_path_i = os.path.join(new_dir,f'{path_pre_i}.conll') write_file(new_path_i,new_conll_i) print(f'Write {conll_path_i} completed!')3.3 实体BIOE输出
def bioe_transfer(tags:List[List[str]]): """ 将BIO格式转成BIOE格式 """ new_tags_all = [] for tags_i in tags: new_tags = [] for i in range(len(tags_i) - 1): tag_i = tags_i[i] tag_after = tags_i[i + 1] if tag_i == 'O': new_tags.append(tag_i) else: if tag_i.split('-')[0] == 'B': new_tags.append(tag_i) else: if tag_i == tag_after: new_tags.append(tag_i) else: # 最后一个 new_tags.append('E-' + tag_i.split('-')[1]) if tags_i[-1] == tags_i[-2]: if tags_i[-1] == 'O': new_tags.append('O') else: new_tags.append('E-' + tags_i[-1].split('-')[1]) else: new_tags.append(tags_i[-1]) new_tags_all.append(new_tags) return new_tags_all3.4 关系结果输出(干货)
import pandas as pd import numpy as np import sys # 堆代码 duidaima.com if __name__ == '__main__': path_txt = sys.argv[1] line_start = [] line_end = [] line_len = [] with open(path_txt,'r',encoding='utf8') as txt_f: txt_lines = txt_f.readlines() num = 0 for txt_i in txt_lines: txt_i = txt_i.split('\n')[0] line_len.append(len(txt_i)) if num == 0: start_i = num else: start_i = num + 1 line_start.append(start_i) num = start_i + len(txt_i) line_end.append(num) rel_label = [] rel_index_1 = [] rel_index_2 = [] ent_index = [] ent_label = [] ent_start = [] ent_end = [] ent_words = [] with open(path_txt.split('.txt')[0] + '.ann','r',encoding='utf8') as ann_f: ann_lines = ann_f.readlines() for ann_i in ann_lines: e_r_i = ann_i.split('\t')[0] if 'R' in e_r_i: rel_i = ann_i.split('\t')[1].split(' ') rel_label.append(rel_i[0]) rel_index_1.append(rel_i[1].split(':')[1]) rel_index_2.append(rel_i[2].split(':')[1]) else: rs_ent = ann_i.split('\t') ent_index.append(rs_ent[0]) ent_words.append(rs_ent[2].strip()) rs_ent_in = rs_ent[1].split(' ') ent_label.append(rs_ent_in[0]) ent_start.append(int(rs_ent_in[1])) ent_end.append(int(rs_ent_in[2]) - 1) df_ent = pd.DataFrame({'index':ent_index,'words':ent_words,'label':ent_label,'start':ent_start,'end':ent_end}) df_rel = pd.DataFrame({'label':rel_label,'ent_1':rel_index_1,'ent_2':rel_index_2}) df_line = pd.DataFrame({'txt':txt_lines,'start':line_start,'end':line_end}) ent_1_label = [] ent_1_words = [] ent_1_start = [] ent_1_end = [] ent_2_label = [] ent_2_words = [] ent_2_start = [] ent_2_end = [] txt_rep = [] txt_rep_start = [] for i in range(len(df_rel)): ent_1_i = df_rel.loc[i,'ent_1'] ent_2_i = df_rel.loc[i,'ent_2'] ent_1_label.append(np.array(df_ent.loc[df_ent['index'] == ent_1_i,'label'])[0]) ent_1_words.append(np.array(df_ent.loc[df_ent['index'] == ent_1_i,'words'])[0]) ent_1_start.append(np.array(df_ent.loc[df_ent['index'] == ent_1_i,'start'])[0]) ent_1_end.append(np.array(df_ent.loc[df_ent['index'] == ent_1_i,'end'])[0]) ent_2_label.append(np.array(df_ent.loc[df_ent['index'] == ent_2_i, 'label'])[0]) ent_2_words.append(np.array(df_ent.loc[df_ent['index'] == ent_2_i, 'words'])[0]) ent_2_start.append(np.array(df_ent.loc[df_ent['index'] == ent_2_i, 'start'])[0]) ent_2_end.append(np.array(df_ent.loc[df_ent['index'] == ent_2_i, 'end'])[0]) # 匹配文档 ent_1_index = np.array(df_ent.loc[df_ent['index'] == ent_1_i, 'start'])[0] df_line_i = df_line[(df_line['start'] <= ent_1_index) & (df_line['end'] >= ent_1_index)] txt_rep_i = np.array(df_line_i['txt'])[0] txt_rep_start_i = np.array(df_line_i['start'])[0] txt_rep.append(txt_rep_i) txt_rep_start.append(txt_rep_start_i) df_rel['ent_1_label'] = ent_1_label df_rel['ent_1_words'] = ent_1_words df_rel['ent_1_start'] = ent_1_start df_rel['ent_1_end'] = ent_1_end df_rel['ent_2_label'] = ent_2_label df_rel['ent_2_words'] = ent_2_words df_rel['ent_2_start'] = ent_2_start df_rel['ent_2_end'] = ent_2_end df_rel['txt'] = txt_rep df_rel['txt_start'] = txt_rep_start df_rel.drop(['ent_1','ent_2'],axis=1,inplace=True) csv_path = path_txt.split('.txt')[0] + '.csv' df_rel.to_csv(csv_path, index=False) print(f'Saved in {csv_path}!')3.5 模型预测结果展示为brat(干货)