Mod7.py

# Program to complete the grammar CSV (tab-separated) file with audio and translations
# Licence: MIT
# Copyright: Eric Streit <eric@yojik.eu> 2022
# - adds pinyin to the file
# - adds the end-of-line fields (1-1-FSI-Chinese)
# - dumps all Hanzi parts into a hanzi.txt file
# Processing of Module 7 of FSI Chinese
#
# original columns:
#
# input:
# numero lettre pinyinOriginal hanziOriginal anglais
#
# output:
# numero lettre pinyinOriginal pinyinGenere hanziOriginal hanziCorrige anglais module lesson origine
#
# * fix the punctuation of column 5 (!;,.?) and the spacing around punctuation
# * convert hanziOriginal to pinyin
# * add the module, lesson and origine fields
# parameters (see the example invocation below):
# * name of the file to process
# * name of the unit (added at the end of each line)
# 5 original fields: number, English1, hanzi1, English2, hanzi2
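#
# Example invocation (the file path and unit label below are hypothetical;
# substitute the actual frames file of the unit being processed):
#
#   python Mod7.py FSI-Chinese/module07/FSI-7-1-frames.csv 7-1
#
# The second argument ends up in the 'lesson' column of every output row.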
# the CSV library
import csv
from typing import TYPE_CHECKING, Generator
# the library to split the Chinese sentences into words
import jieba
# the pinyin library
from pypinyin import pinyin, lazy_pinyin, Style
from xpinyin import Pinyin
from opencc import OpenCC
# hanzi tokenizer
from chinese import ChineseAnalyzer
# os module
import os
# shutil (moving files)
import shutil
# random
import random, copy, re, sys
# google TTS
from GoogleTTS import GoogleTTS
# delay
import time
#
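# Note: several of these imports (typing, jieba, pypinyin, shutil, random, copy,
# re, GoogleTTS, time) are not used in this particular script; they are presumably
# kept in common with the other FSI module-processing scripts.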
theFileName = str(sys.argv[1])  # the file name to work with
print('theFileName : ', theFileName)
theFileNamePrefix = os.path.dirname(theFileName)
print('theFileNamePrefix : ', theFileNamePrefix)
theUnit = str(sys.argv[2])
print('the Unit : ', theUnit)
theModule = 7
print(theModule)
theOrigin = "FSI-Chinese"
print(theOrigin)
# the output filename is derived from the input filename
theOutputFileName = os.path.join(theFileNamePrefix, 'FSI-' + theUnit + '-frames.simp.csv')
print('theOutputFileName : ', theOutputFileName)
#
baseDirectory = os.getcwd()
print(' Base directory : ', baseDirectory)
# the pinyin comparison file
thePinyinFile = os.path.join(theFileNamePrefix, 'pinyin.txt')
# initialize the Hanzi tokenizer
analyzer = ChineseAnalyzer()
# initialize the Hanzi-to-pinyin converter
hanConvert = Pinyin()
# initialize OpenCC (traditional -> simplified)
cc = OpenCC('t2s')
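# Quick illustration of what the two converters do (illustrative values based on
# the libraries' documented behaviour, not produced by this script):
#   cc.convert('漢語')                                -> '汉语'       (traditional -> simplified)
#   hanConvert.get_pinyin('上海')                     -> 'shang-hai'
#   hanConvert.get_pinyin('上海', tone_marks='marks') -> 'shàng-hǎi'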
with open(theOutputFileName, 'w') as o:
    fieldNames = ['numero', 'lettre', 'pinyinOriginal', 'hanziCorrige', 'anglais', 'module', 'lesson', 'origine']
    csvwriter = csv.DictWriter(o, fieldnames=fieldNames, delimiter='\t')
    with open(thePinyinFile, 'w') as t:
        print('pinyin comparison text file opened')
        with open(theFileName, 'r') as f:
            print('csv file opened')
            inputFieldsNames = ['numero', 'lettre', 'pinyinOriginal', 'hanziOriginal', 'anglais']
            data = csv.DictReader(f, delimiter='\t', fieldnames=inputFieldsNames)
            finalRow = {}
            theLineNumber = 0
            for row in data:
                theLineNumber = theLineNumber + 1
                print('theLineNumber : ', theLineNumber)
                # copy the identifying fields unchanged
                finalRow['numero'] = row['numero']
                finalRow['lettre'] = row['lettre']
                finalRow['pinyinOriginal'] = row['pinyinOriginal']
                # opencc: converted = cc.convert(to_convert)
                # convert traditional characters to simplified
                hanziOriginal = cc.convert(row['hanziOriginal'])
                # fix the hanzi punctuation (ASCII -> full-width)
                theHanziCorrige = hanziOriginal.replace('.', '。').replace('?', '？').replace(',', '，').replace('!', '！')
                # split the sentences into words
                result = analyzer.parse(theHanziCorrige)
                tabHanzi = result.tokens()
                hanzitokenized = ' '.join(str(x) for x in tabHanzi)
                finalRow['hanziCorrige'] = theHanziCorrige
                # generate pinyin from the tokenized hanziCorrige (full-width punctuation back to ASCII)
                thePinyinGenere = hanConvert.get_pinyin(hanzitokenized, '', tone_marks='marks').replace('。', '.').replace('，', ', ').replace('？', '?').replace('！', '!').capitalize()
                # finalRow['pinyinGenere'] = thePinyinGenere
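                # note: the generated pinyin is not written to the output CSV
                # (the 'pinyinGenere' assignment above stays commented out and
                # the column is absent from fieldNames); it only goes to the
                # pinyin.txt comparison file below, for manual proofreading
                # against the original pinyin column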
                finalRow['anglais'] = row['anglais']
                finalRow['module'] = theModule
                finalRow['lesson'] = theUnit
                finalRow['origine'] = theOrigin
                csvwriter.writerow(finalRow)
                # write the pinyin comparison file: original line, generated line, blank line
                t.write(finalRow['pinyinOriginal'] + '\n')
                t.write(thePinyinGenere + '\n\n')
# the with-statements close the three files automatically on exit
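#
# Two files are produced next to the input file:
#   FSI-<unit>-frames.simp.csv : tab-separated rows with the columns listed in fieldNames
#   pinyin.txt                 : pairs of lines (original pinyin, generated pinyin) for proofreading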