# -*- coding: utf-8 -*-
"""Created on Wed Jun 28 18:42:33 2017

Extract word-alignment matrices from a GIZA++ alignment file.

Each sentence pair yields one 0/1 matrix whose rows are target words and
whose columns are source words. The leading NULL source token is skipped, so
column 0 corresponds to the first real source word.
"""
import re

import numpy as np


def alig_pairs(filepath):
    """Build one target-by-source alignment matrix per sentence pair.

    The file is read as consecutive three-line records:

    1. a header line (its content is ignored),
    2. the target sentence, whitespace-tokenised,
    3. the source sentence, where every token is followed by
       ``({ i j ... })`` listing the 1-based target positions aligned to it.

    Args:
        filepath: Path of the alignment file to read.

    Returns:
        A list of ``numpy`` arrays, one per record, each of shape
        ``(number of target words, number of source words excluding NULL)``.
        Entry ``[t][s]`` is 1 when target word ``t`` is aligned to source
        word ``s`` and 0 otherwise.
    """
    # Splitting on this pattern (with its capture group) yields
    # [word0, indices0, word1, indices1, ..., ''].
    annotation = re.compile(r' \(\{([0-9 ]*)\}\) ?')
    matrices = []

    # The context manager closes the file even when a malformed line raises.
    with open(filepath, 'r') as handle:
        while True:
            header = handle.readline()
            if not header:
                break
            target_words = handle.readline().strip().split()
            source_line = handle.readline().strip()

            parts = annotation.split(source_line)
            if len(parts) < 3:
                # No "({ ... })" annotation at all, e.g. a trailing blank
                # line or a truncated record. The source-word count would be
                # negative and np.zeros would raise, so skip the record.
                continue

            # parts holds (word, indices) pairs plus a trailing ''; the first
            # pair is the NULL token, which is not given a column.
            source_count = len(parts) // 2 - 1
            matrix = np.zeros((len(target_words), source_count))

            # parts[3], parts[5], ... are the index lists of the real
            # (non-NULL) source words, in order.
            for column, index_field in enumerate(parts[3::2]):
                for token in index_field.split():
                    matrix[int(token) - 1][column] = 1

            matrices.append(matrix)

    return matrices
import codecs


def _read_sentences(path):
    """Read a UTF-8 file of four-field word lines grouped into sentences.

    A line with exactly four whitespace-separated fields becomes one word
    entry, with its second and third fields swapped and all four re-joined by
    single spaces. An empty (or whitespace-only) line closes the current
    sentence. Lines with any other number of fields are ignored, and words
    after the last blank line are not returned.
    """
    sentences = []
    words = []
    # The context manager guarantees the handle is closed.
    with codecs.open(path, 'r', encoding='utf-8') as handle:
        for line in handle:
            fields = line.split()
            if len(fields) == 4:
                words.append(
                    " ".join((fields[0], fields[2], fields[1], fields[3]))
                )
            elif not fields:
                sentences.append(words)
                words = []
    return sentences


def get_matrix():
    """Build one labelled grid per sentence pair from the result files.

    Reads ``result/result_cn`` and ``result/result_en``. For every sentence
    pair a grid of strings with ``len(chinese) + 1`` rows and
    ``len(english) + 1`` columns is created, filled with ``"0"``. Row 0 holds
    the English word entries from column 1 on, and column 0 holds the Chinese
    word entries from row 1 on. The grids differ in size between sentences.

    Returns:
        A tuple ``(matrix, chinese_sentence)``: the list of grids and the
        list of Chinese sentences (each a list of word-entry strings). If
        the two files contain a different number of sentences, ``'error'`` is
        printed and the list of grids is empty.
    """
    chinese_sentence = _read_sentences("result/result_cn")
    english_sentence = _read_sentences('result/result_en')

    matrix = []
    if len(english_sentence) != len(chinese_sentence):
        print('error')
        return matrix, chinese_sentence

    for chinese_words, english_words in zip(chinese_sentence, english_sentence):
        grid = [
            ["0"] * (len(english_words) + 1)
            for _ in range(len(chinese_words) + 1)
        ]
        # Header row: English entries start at column 1.
        grid[0][1:] = english_words
        # Header column: Chinese entries start at row 1.
        for row, entry in enumerate(chinese_words, start=1):
            grid[row][0] = entry
        matrix.append(grid)

    return matrix, chinese_sentence
import os
import string


def count(filepath):
    """Print line statistics for a text file.

    Prints the file path followed by the number of blank lines, the number of
    lines whose first non-whitespace character is ``#``, and the total number
    of lines.

    Args:
        filepath: Path of the file to inspect.
    """
    total = 0        # total number of lines
    count_pound = 0  # lines starting with '#'
    count_blank = 0  # empty or whitespace-only lines

    # The context manager closes the file once counting is done.
    with open(filepath, 'r') as handle:
        for raw_line in handle:
            total += 1
            # Keep the stripped text: str.strip() returns a new string, so
            # calling it without using the result has no effect.
            stripped = raw_line.strip()
            if not stripped:
                count_blank += 1
            elif stripped.startswith('#'):
                count_pound += 1

    print(filepath)
    print("countBlank:%d" % count_blank)
    print("countPound:%d" % count_pound)
    print("total:%d" % total)


if __name__ == "__main__":
    count('result_cn')
#-*-coding:utf-8-*-
def bijiao(path1='lmt.txt', path2='lh.txt'):
    """Compare two text files line by line.

    Args:
        path1: Path of the first file (defaults to ``'lmt.txt'``).
        path2: Path of the second file (defaults to ``'lh.txt'``).

    Returns:
        A list of the 1-based line numbers at which the files differ. A line
        present in only one of the files counts as a difference. The list is
        empty when the files are identical.
    """
    differing = []
    line_number = 0
    with open(path1, 'r') as f1, open(path2, 'r') as f2:
        for a in f1:
            # At end of file readline() returns '', which never equals a
            # real line, so surplus lines in the first file are reported.
            b = f2.readline()
            line_number += 1
            if a != b:
                differing.append(line_number)
        # Lines left over in the second file are differences as well.
        while f2.readline():
            line_number += 1
            differing.append(line_number)
    return differing


if __name__ == "__main__":
    c = bijiao()
    if not c:
        print('两个文件一样!')
    else:
        print('有%d处不同' % len(c))
        for each in c:
            print('%d行不一样' % each)