ref: 91b74e4777120478cf658bad19323e1c147b0449
dir: /desereter.ml/
let entry_rgx = Str.regexp "\\([.a-z'-]+\\)(?[0-9]?)? \\(.*\\)"
type wordset = {prefix: string; word: string; suffix: string}
let is_uppercase = function
| 'A'
|'B'
|'C'
|'D'
|'E'
|'F'
|'G'
|'H'
|'I'
|'J'
|'K'
|'L'
|'M'
|'N'
|'O'
|'P'
|'Q'
|'R'
|'S'
|'T'
|'U'
|'V'
|'W'
|'X'
|'Y'
|'Z' ->
true
| _ -> false
let get_vowel vowel =
if String.length vowel != 3 then ("", "")
else
let trimmed = String.sub vowel 0 2 in
match trimmed with
| "IY" -> ("𐐀", "𐐨")
| "EY" -> ("𐐁", "𐐩")
| "AA" -> ("𐐂", "𐐪")
| "AO" -> ("𐐉", "𐐱")
| "OW" -> ("𐐄", "𐐬")
| "UW" -> ("𐐅", "𐐭")
| "IH"
|"IX" ->
("𐐆", "𐐮")
| "EH" -> ("𐐇", "𐐯")
| "AE" -> ("𐐈", "𐐰")
| "AX" -> ("𐐉", "𐐱")
| "AH" -> ("𐐊", "𐐲")
| "UH" -> ("𐐋", "𐐳")
| "AY" -> ("𐐌", "𐐴")
| "AW" -> ("𐐍", "𐐵")
| "ER" -> ("𐐊𐐡", "𐐊𐑉")
| _ -> ("", "")
let get_char = function
| "W" -> ("𐐎", "𐐶")
| "Y" -> ("𐐏", "𐐷")
| "H"
|"HH" ->
("𐐐", "𐐸")
| "P" -> ("𐐑", "𐐹")
| "B" -> ("𐐒", "𐐺")
| "T" -> ("𐐓", "𐐻")
| "D" -> ("𐐔", "𐐼")
| "CH" -> ("𐐕", "𐐽")
| "JH" -> ("𐐖", "𐐾")
| "K" -> ("𐐗", "𐐿")
| "G" -> ("𐐘", "𐑀")
| "F" -> ("𐐙", "𐑁")
| "V" -> ("𐐚", "𐑂")
| "TH" -> ("𐐛", "𐑃")
| "DH" -> ("𐐜", "𐑄")
| "S" -> ("𐐝", "𐑅")
| "Z" -> ("𐐞", "𐑆")
| "SH" -> ("𐐟", "𐑇")
| "ZH" -> ("𐐠", "𐑈")
| "R" -> ("𐐡", "𐑉")
| "L" -> ("𐐢", "𐑊")
| "M" -> ("𐐣", "𐑋")
| "N" -> ("𐐤", "𐑌")
| "NX"
|"NG" ->
("𐐥", "𐑍")
| v -> get_vowel v
let rec parse_arpabet line des uppercase =
match line with
| hd :: tl ->
let u, l = get_char hd in
(if uppercase then u else l) ^ parse_arpabet tl des false
| [] -> des
let unquoted word = String.sub word 1 (String.length word - 2)
let consider word =
let wrd = ref (String.lowercase_ascii word) in
let prefix = ref "" in
let suffix = ref "" in
( try
let pos = Str.search_forward (Str.regexp "[({\"]") !wrd 0 + 1 in
wrd := String.sub word pos (String.length !wrd - pos) ;
prefix := String.sub word 0 pos
with Not_found -> () ) ;
( try
let pos = Str.search_backward (Str.regexp "[})\"\\.,!;:]") !wrd (String.length !wrd) in
suffix := String.sub !wrd pos (String.length !wrd - pos) ;
wrd := String.sub !wrd 0 pos;
with Not_found -> () ) ;
{prefix= !prefix; word= !wrd; suffix= !suffix}
let parse word dictionary =
let uppercase = is_uppercase word.[0] in
let wordparts = consider word in
try
let des = parse_arpabet (String.split_on_char ' ' (Hashtbl.find dictionary wordparts.word)) "" uppercase in
wordparts.prefix ^ des ^ wordparts.suffix
with Not_found -> word
let sanitize line = Str.global_replace (Str.regexp "\\.\\.\\.") " ... " line
let load_dictionary extra =
let default =
try
let prefix = Unix.getenv "OPAM_SWITCH_PREFIX" in
prefix ^ "/share/desereter/cmudict.dict"
with Not_found -> "/lib/cmudict.dict"
in
let dictionaries = [default] @ String.split_on_char ';' extra in
let dictionary = Hashtbl.create 150000 in
let load file =
if String.length file > 0 then
let ic = open_in file in
try
while true do
let entry = input_line ic in
if Str.string_match entry_rgx entry 0 then
let word = Str.matched_group 1 entry in
let pronunciation = Str.matched_group 2 entry in
Hashtbl.add dictionary word pronunciation
done
with End_of_file -> close_in ic in
List.iter load dictionaries ; dictionary
let translate dictionary line =
let words = String.split_on_char ' ' (sanitize line) in
let words =
List.filter
(fun x ->
let w = String.trim x in
String.length w > 0 )
words in
print_endline (List.fold_left (fun acc word -> acc ^ parse word dictionary ^ " ") "" words)
let () =
let line = ref "" in
let extra = ref "" in
Arg.parse
[(* ("-i", Arg.Set_string line, "input"); *) ("-d", Arg.Set_string extra, "dictionary")]
(fun x -> line := x)
(Sys.argv.(0) ^ " [-d dictionary] <-i input>") ;
let dictionary = load_dictionary !extra in
if String.length !line > 0 then translate dictionary !line
else
try
while true do
read_line () |> translate dictionary
done
with End_of_file -> ()