123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138 |
- # Program to complete the grammar CSV (tab separated) file with audio and translations
- # Licence: MIT
- # Copyrights : Eric Streit <eric@yojik.eu> 2022
- # adding pinyin to the file
- # adding end of line (1-1-FSI-Chinese)
- # dumping all Hanzi parts into a hanzi.txt file
- # Traitement du Module 7 de FSI Chinese
- #
- # colonnes originales:
- #
- # entrée:
- # numero lettre pinyinOriginal hanziOriginal anglais
- #
- # sortie:
- # numero lettre pinyinOriginal hanziCorrige anglais module lesson origine
- #
- # * Corriger la ponctuation des colonnes 5 (!;,.?) et espaces dans les ponctuations
- # * transformer les hanziOriginal en pinyin
- # * Ajouter les champs module lesson origine
- # parameters:
- # * name of the file to deal with
- # * name of the unit (to be added at the end of each line)
- # 5 original fields: numero, lettre, pinyinOriginal, hanziOriginal, anglais
- # the CSV library
- import csv
- from typing import TYPE_CHECKING, Generator
- # the library to split the Chinese sentences into words
- import jieba
- # the pinyin library
- from pypinyin import pinyin, lazy_pinyin, Style
- from xpinyin import Pinyin
- from opencc import OpenCC
- # hanzi tokenizer
- from chinese import ChineseAnalyzer
- # os module
- import os
- # shutils (moving files)
- import shutil
- # random
- import random,copy, re, sys
- # google TTS
- from GoogleTTS import GoogleTTS
- # delay
- import time
- #
# Constants added to every output row.
theModule = 7
theOrigin = "FSI-Chinese"

# ASCII sentence punctuation -> CJK punctuation, applied to the hanzi column.
# The CJK characters are spelled as escapes on purpose: written literally they
# are easily folded back to their ASCII look-alikes by tools that normalise
# Unicode, which turns the substitutions into silent no-ops.
_ASCII_TO_CJK = str.maketrans({
    '.': '\u3002',  # ideographic full stop
    '?': '\uff1f',  # fullwidth question mark
    ',': '\uff0c',  # fullwidth comma
    '!': '\uff01',  # fullwidth exclamation mark
})

# CJK punctuation -> ASCII punctuation, applied to the generated pinyin.
# The comma maps to ", " so that a space follows it in the romanised text.
_CJK_TO_ASCII = str.maketrans({
    '\u3002': '.',
    '\uff0c': ', ',
    '\uff1f': '?',
    '\uff01': '!',
})


def correct_hanzi_punctuation(text):
    """Return *text* with ASCII ``. ? , !`` replaced by their CJK forms."""
    return text.translate(_ASCII_TO_CJK)


def pinyin_punctuation_to_ascii(text):
    """Return *text* with CJK full stop, comma, ``?`` and ``!`` made ASCII."""
    return text.translate(_CJK_TO_ASCII)


def main(argv=None):
    """Convert one tab-separated FSI lesson file.

    Reads the input file named by the first argument, converts the hanzi
    column from traditional to simplified characters, corrects its
    punctuation, and writes:

    * ``FSI-<unit>-frames.simp.csv`` (tab separated, no header row) next to
      the input file;
    * ``pinyin.txt`` next to the input file, holding for every row the
      original pinyin followed by the pinyin generated from the corrected
      hanzi, so the two can be compared by eye.

    Parameters:
        argv: argument list shaped like ``sys.argv`` (program name, input
            file, unit name). Defaults to ``sys.argv``.

    Exits with a usage message when fewer than two arguments are supplied.
    """
    args = sys.argv if argv is None else argv
    if len(args) < 3:
        sys.exit('usage: <script> <input-file> <unit>')

    input_path = str(args[1])
    print ('theFileName : ',input_path)
    input_dir = os.path.dirname(input_path)
    print ('theFileNamePrefix : ',input_dir)
    unit = str(args[2])
    print ('the Unit : ', unit)
    print (theModule)
    print (theOrigin)
    # The output file name is derived from the unit and placed next to the input.
    output_path = os.path.join(input_dir, 'FSI-' + unit + '-frames.simp.csv')
    print ('theOutputFileName : ',output_path)

    base_directory = os.getcwd()
    print (' Base directory : ', base_directory)

    # Comparison file: original pinyin versus generated pinyin.
    pinyin_path = os.path.join(input_dir, 'pinyin.txt')

    analyzer = ChineseAnalyzer()   # hanzi tokenizer
    han_convert = Pinyin()         # hanzi -> pinyin converter
    cc = OpenCC('t2s')             # traditional -> simplified converter

    output_fields = ['numero', 'lettre', 'pinyinOriginal', 'hanziCorrige',
                     'anglais', 'module', 'lesson', 'origine']
    input_fields = ['numero', 'lettre', 'pinyinOriginal', 'hanziOriginal',
                    'anglais']

    # utf-8 is set explicitly because the data is Chinese text and the
    # platform default encoding is not guaranteed to handle it; newline=''
    # is what the csv module asks for on files it reads or writes.
    with open(output_path, 'w', encoding='utf-8', newline='') as out_file:
        writer = csv.DictWriter(out_file, fieldnames=output_fields,
                                delimiter='\t')
        with open(pinyin_path, 'w', encoding='utf-8') as pinyin_file:
            print('pinyin comparaison text file opened')
            with open(input_path, 'r', encoding='utf-8', newline='') as in_file:
                print('csv file opened')
                reader = csv.DictReader(in_file, delimiter='\t',
                                        fieldnames=input_fields)

                for line_number, row in enumerate(reader, start=1):
                    print ('theLineNumber : ',line_number)

                    # Traditional -> simplified, then punctuation correction.
                    simplified = cc.convert(row['hanziOriginal'])
                    corrected_hanzi = correct_hanzi_punctuation(simplified)

                    # Split the sentence into words so the generated pinyin
                    # is built from space-separated tokens.
                    tokens = analyzer.parse(corrected_hanzi).tokens()
                    tokenized = ' '.join(str(token) for token in tokens)

                    # The generated pinyin is not part of the CSV output; it
                    # only goes to the comparison file.
                    generated_pinyin = pinyin_punctuation_to_ascii(
                        han_convert.get_pinyin(tokenized, '',
                                               tone_marks='marks')
                    ).capitalize()

                    out_row = {
                        'numero': row['numero'],
                        'lettre': row['lettre'],
                        'pinyinOriginal': row['pinyinOriginal'],
                        'hanziCorrige': corrected_hanzi,
                        'anglais': row['anglais'],
                        'module': theModule,
                        'lesson': unit,
                        'origine': theOrigin,
                    }
                    writer.writerow(out_row)

                    pinyin_file.write(out_row['pinyinOriginal'] + '\n')
                    pinyin_file.write(generated_pinyin + '\n\n')
    # No explicit close() calls: the with-blocks have already closed the files.


if __name__ == "__main__":
    main()
|