mod7-dia.py 4.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140
  1. # Program to complete the grammar CSV (tab separated) file with audio and translations
  2. # Licence: MIT
  3. # Copyrights : Eric Streit <eric@yojik.eu> 2022
  4. # adding pinyin to the file
  5. # adding end of line (1-1-FSI-Chinese)
  6. # dumping all Hanzi parts into a hanzi.txt file
# Processing of Module 7 of FSI Chinese
#
# original columns:
#
# input:
# lettre pinyinOriginal hanziOriginal anglais
#
# output:
# lettre pinyinOriginal hanziCorrige anglais module lesson origine
#
# plus a text file with the pinyin on 2 lines (original, then generated) for comparison
#
# * Correct the punctuation of the hanzi column (!;,.?) and the spacing around punctuation
# * convert hanziOriginal to pinyin
# * add the fields module, lesson and origine
# parameters:
# * name of the file to deal with
# * name of the unit (to be added at the end of each line)
# 4 input fields (tab separated): lettre, pinyinOriginal, hanziOriginal, anglais
  26. # the CSV library
  27. import csv
  28. from typing import TYPE_CHECKING, Generator
  29. # the library to split the Chinese sentences into words
  30. import jieba
  31. # the pinyin library
  32. from pypinyin import pinyin, lazy_pinyin, Style
  33. from xpinyin import Pinyin
  34. from opencc import OpenCC
  35. # hanzi tokenizer
  36. from chinese import ChineseAnalyzer
  37. # os module
  38. import os
  39. # shutils (moving files)
  40. import shutil
  41. # random
  42. import random,copy, re, sys
  43. # google TTS
  44. from GoogleTTS import GoogleTTS
  45. # delay
  46. import time
  47. #
  48. theFileName = '' # the file name to work with
  49. theFileName = str(sys.argv[1])
  50. print ('theFileName : ',theFileName)
  51. theFileNamePrefix = os.path.dirname(theFileName)
  52. print ('theFileNamePrefix : ',theFileNamePrefix)
  53. theUnit = str(sys.argv[2])
  54. print ('the Unit : ', theUnit)
  55. theModule = 7
  56. print (theModule)
  57. theOrigin = "FSI-Chinese"
  58. print (theOrigin)
  59. theNumber = theFileName.split('.')[0][-1]
  60. print ('the number : ', theNumber)
  61. theOutputFileName = os.path.join(theFileNamePrefix, 'FSI-' + theUnit + '-dialogue-'+ theNumber + '.simp.csv') # the output filename according to the input filename
  62. print ('theOutputFileName : ',theOutputFileName)
  63. #
  64. baseDirectory = os.getcwd()
  65. print (' Base directory : ', baseDirectory)
  66. # le fichier de comparaison
  67. thePinyinFile = os.path.join(theFileNamePrefix, 'dialogue-' + theNumber + '.txt')
  68. # initialisation du tokenizer Hanzi
  69. analyzer = ChineseAnalyzer()
  70. # initialisation du convertisseur Hanzi Pinyin
  71. hanConvert = Pinyin()
  72. #initialisation de opencc
  73. cc = OpenCC('t2s')
  74. with open(theOutputFileName,'w') as o:
  75. fieldNames = ['lettre', 'pinyinOriginal', 'hanziCorrige', 'anglais', 'module', 'lesson', 'origine']
  76. csvwriter = csv.DictWriter(o, fieldnames=fieldNames,delimiter='\t')
  77. with open(thePinyinFile,'w') as t:
  78. print('pinyin comparaison text file opened')
  79. with open(theFileName,'r') as f:
  80. print('csv file opened')
  81. inputFieldsNames = ['lettre', 'pinyinOriginal', 'hanziOriginal', 'anglais']
  82. data = csv.DictReader(f, delimiter='\t',fieldnames=inputFieldsNames)
  83. finalRow = {}
  84. theLineNumber = 0
  85. for row in data:
  86. theLineNumber = theLineNumber + 1
  87. print ('theLineNumber : ',theLineNumber)
  88. # Traitement du numéro
  89. finalRow['lettre'] = row['lettre']
  90. finalRow['pinyinOriginal'] = row['pinyinOriginal']
  91. # opencc : converted = cc.convert(to_convert)
  92. # transformation de traditional à simplified
  93. hanziOriginal = cc.convert(row['hanziOriginal'])
  94. # correction du hanzi
  95. theHanziCorrige = hanziOriginal.replace('.','。').replace('?','?').replace(',',',').replace('!','!')
  96. # on separe les phrases en mots
  97. result = analyzer.parse(theHanziCorrige)
  98. tabHanzi = result.tokens()
  99. hanzitokenized = ' '.join(str(x) for x in tabHanzi)
  100. finalRow['hanziCorrige'] = theHanziCorrige
  101. # generation du pinyin à partir du hanziCorrige tokénisé
  102. thePinyinGenere = hanConvert.get_pinyin(hanzitokenized, '', tone_marks='marks').replace('。','.').replace(',',', ').replace('?','?').replace('!','!').capitalize()
  103. finalRow['anglais'] = row['anglais']
  104. finalRow['module'] = theModule
  105. finalRow['lesson'] = theUnit
  106. finalRow['origine'] = theOrigin
  107. csvwriter.writerow(finalRow)
  108. # writing the pinyin comparaison file
  109. t.write(finalRow['pinyinOriginal'] + '\n')
  110. t.write(thePinyinGenere + '\n\n')
  111. f.close()
  112. t.close()
  113. o.close()