cedict2csv-ocr.js 2.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899
  1. var fs = require("fs");
  2. var readline = require("readline");
  3. var Stream = require("stream");
  4. // librairie de traitement du pinyin
  5. const pinyinizer = require("pinyinizer");
  // Utilities: separator strings (a tab and a newline).
  const tab = "\t";
  const endLine = "\n";

  // Mutable record holding the fields of the dictionary entry being processed.
  let ligne = { hanzi: "", traditional: "", pinyin: "", translations: [] };
  /**
   * Restores the shared `ligne` record to its empty state: the three string
   * fields become "" and `translations` becomes a brand-new empty array.
   */
  function resetLigne() {
    Object.assign(ligne, {
      hanzi: "",
      traditional: "",
      pinyin: "",
      translations: [],
    });
  }
  // The CSV line most recently written to the output file.
  var ligneCSV = "";

  /**
   * Reads a CC-CEDICT style dictionary file line by line and, for every entry,
   * appends the entry's pinyin reading (all spaces removed) followed by
   * `endLine` to the output file.
   *
   * Expected entry shape: `TRADITIONAL SIMPLIFIED [pin1 yin1] /def 1/def 2/`.
   * Blank lines and lines starting with "#" are skipped. While an entry is
   * being processed its fields are stored on the shared `ligne` record, which
   * is cleared again with `resetLigne()` once the line has been written.
   *
   * @param {string} inputFile - Path of the dictionary file to read.
   * @param {string} outputFile - Path of the file the pinyin lines are
   *   appended to (created if missing, never truncated).
   * @returns {readline.Interface} The line reader; its "close" event fires
   *   once the whole input has been processed.
   */
  function readFileLineByLine(inputFile, outputFile) {
    const rl = readline.createInterface({
      input: fs.createReadStream(inputFile),
      terminal: false
    });

    rl.on("line", function(line) {
      // Nothing to convert on blank lines or "#" comment/header lines.
      if (line.trim() === "" || line[0] === "#") {
        return;
      }

      // The first two space-separated tokens are the two headword forms.
      const headwords = line.split(" ", 2);
      ligne.traditional = headwords[0];
      ligne.hanzi = headwords[1];

      // The reading is the text inside the first [...] pair; "" when absent.
      const bracketed = /\[([^\]]*)\]/.exec(line);
      const numbered = (bracketed ? bracketed[1] : "").toLowerCase();

      // NOTE(review): CC-CEDICT spells ü as "u:"; confirm that pinyinizer
      // accepts that spelling in the headword reading.
      let accented;
      try {
        // Convert tone numbers to tone marks, then drop any leftover "5"
        // (the neutral-tone digit in CC-CEDICT numbering).
        accented = pinyinizer.pinyinize(numbered).replace(/5/g, "");
      } catch (err) {
        // Best effort: report the reading that failed and keep it unconverted.
        console.log("Erreur: " + numbered);
        accented = numbered;
      }

      // Definitions are the segments between the first and the last "/".
      // They are kept on the record but are not part of the written line.
      ligne.translations = line.split("/").slice(1, -1);

      // Remove every space so the syllables of one entry form a single token.
      ligne.pinyin = accented.replace(/ /g, "");

      ligneCSV = ligne.pinyin + endLine;
      console.log(ligneCSV);
      fs.appendFileSync(outputFile, ligneCSV);
      resetLigne();
    });

    return rl;
  }
  // Script entry point: reads "cedict.txt" and appends the result to "cedict-ocr.csv" (both paths are resolved against the current working directory).
  86. readFileLineByLine("cedict.txt", "cedict-ocr.csv");