home *** CD-ROM | disk | FTP | other *** search
Text File | 1993-03-21 | 60.0 KB | 1,436 lines |
- Newsgroups: comp.sources.misc
- From: jkl@osc.edu (Jan Labanowski)
- Subject: v36i028: translit - transliterate foreign alphabets, Part06/10
- Message-ID: <1993Mar19.224524.12252@sparky.imd.sterling.com>
- X-Md4-Signature: e6f0a520fd20024ed79b2a38ad48b863
- Date: Fri, 19 Mar 1993 22:45:24 GMT
- Approved: kent@sparky.imd.sterling.com
-
- Submitted-by: jkl@osc.edu (Jan Labanowski)
- Posting-number: Volume 36, Issue 28
- Archive-name: translit/part06
- Environment: UNIX, MS-DOS, VMS
-
- #! /bin/sh
- # This is a shell archive. Remove anything before this line, then feed it
- # into a shell via "sh file" or similar. To overwrite existing files,
- # type "sh file -c".
- # Contents: pho-koi8.rus reg_exp.h tex-koi8.rus translit.txt.A
- # Wrapped by kent@sparky on Fri Mar 19 16:00:13 1993
- PATH=/bin:/usr/bin:/usr/ucb:/usr/local/bin:/usr/lbin ; export PATH
- echo If this archive is complete, you will see the following message:
- echo ' "shar: End of archive 6 (of 10)."'
- if test -f 'pho-koi8.rus' -a "${1}" != "-c" ; then
- echo shar: Will not clobber existing file \"'pho-koi8.rus'\"
- else
- echo shar: Extracting \"'pho-koi8.rus'\" \(10735 characters\)
- sed "s/^X//" >'pho-koi8.rus' <<'END_OF_FILE'
- X# Jan Labanowski, jkl@osc.edu, Jan. 10, 1992 # File lc_koi8.dat
- X
- X# This is a transliteration data file for converting from various phonetic
- X# transliteration schemes to KOI-8 as used by RELCOM (GOST 19768-74).
- X# It is not possible to exactly represent phonetic transliteration
- X# since it is very flexible and frequently contradictory.
- X# This file is complicated (in my humble opinion) and it will take
- X# a substantial amount of time to process longer files on a slower
- X# computer. However, this is what you get, if you want something more
- X# or less general. If your phonetic transliteration is consistent
- X# and unequivocal, you will be much better off writing a specific
- X# transliteration file, e.g. the GOST 16876-71 transliteration file (phg) or
- X# Pokrovsky scheme (php).
- X# The English text should be enclosed in braces {}, while the Russian text
- X# is outside the braces.
- X# To be used with translit.c program by Jan Labanowski. For a format of
- X# this file consult translit documentation
- X
- X 1 file version number
- X
- X " " # string delimiters
- X [ ] # list delimiters
- X { } # regular expression delimiters
- X
- X
- X
- X#starting sequence
- X""
- X
- X#ending sequence
- X""
- X
- X 2 # number of input SHIFT sequences
- X"" "" "" "" "" "" # no SHIFT-OUT/IN for Russian letters
- X"{" "" "" "" "}" "" # Latin text in braces {}
- X
- X 0 # number of output SHIFT sequences, two sets of input characters
- X
- X# conversion table
- X# inp_set inp_seq out_set out_seq
- X
- X# Latin(ASCII) is embraced in {}
- X 2 ["'A-Za-z] 0 ["'A-Za-z]
- X
- X# Cyrillic letters
- X
- X# If already converted to KOI8 by backstepping, send it to output
- X 1 {([\0x80-\0xFF])} 0 {\1}
- X
- X# Convert " followed by a capital letter to capital Tvyordyj znak and backstep
- X# otherwise " will be treated as a small tvyordyj znak
- X 1 {"([^A-Za-z])} 0 {"\1} # " at the end
- X 1 {([^A-Za-z])"} 0 {\1"} # " at the beginning
- X 1 {"([A-Z])} -2 {\0xFF\1} # capital Tvyordyj znak
- X
- X
- X 1 {Q[Hh]} 0 "\0xFF" # Some use it as Tvyordyj
- X 1 "qh" 0 "\0xDF" # Some use it as tvyordyj
- X 1 "\0x22" 0 "\0xDF" # tvyordyj znak
- X
- X# Convert ' preceded by a capital letter to capital Myagkij znak and backstep
- X# otherwise ' will be treated as a small myagkij znak
- X
- X 1 "''" 0 "''" # double quote
- X 1 {([^A-Za-z])'} 0 {\1'} # opening quote
- X# Myagkij znak
- X 1 {((S[Hh][Cc][Hh])|(S[Hh])|(C[Hh])|(T[Cc][Hh])|([A-Z]))'} -1 {\1\0xF8}
- X
- X 1 "Q" 0 "\0xF8"
- X
- X 1 "'" 0 "\0xD8" # myagkij znak
- X 1 "q" 0 "\0xD8"
- X
- X 1 {(([YIJ]?[EOUA])|([J]?[EOAUY]))((Y)|([IJ]))([^A-Za-z])} -1 {\1\0xEA\7} #-J
- X# 12 3 45 6 7
- X 1 {(([yij]?[eoua])|([j]?[eoauy]))((y)|([ij]))([^A-Za-z])} -1 {\1\0xCA\7} #-j
- X# 12 3 45 6 7
- X
- X# the story of ts versus c (the ts for c was a stupid idea of Library of
- X# Congress --- very, very stupid... T and S should be T and S, not C).
- X 1 "INTS" 0 "\0xE9\0xED\0xE3" #INC
- X 1 "INC" 0 "\0xE9\0xED\0xE3" #INC
- X 1 "ints" 0 "\0xC9\0xCD\0xC3" #inc
- X 1 "inc" 0 "\0xC9\0xCD\0xC3" #inc
- X 1 "CI" 0 "\0xE3\0xE9"
- X 1 "ci" 0 "\0xC3\0xC9"
- X 1 {AVIA(TS|C)} 0 "\0xE1\0xE2\0xE9\0xE1\0xE3" #aviac
- X 1 {avia(ts|c)} 0 "\0xC1\0xC2\0xC9\0xC1\0xC3"
- X 1 {tsi([iyjo])} -2 {\0xC3\0xC9\1} # ci
- X 1 {TSI([IYJO])} -2 {\0xE3\0xE9\1} # ci
- X 1 {T[Ss]([Aa])} -2 {\0xE3\1} # CA
- X 1 {t[Ss]([Aa])} -2 {\0xC3\1} # ca
- X 1 {([DdKk])T[Ss]} -1 {\1\0xE3} # DC or KC
- X 1 {([DdKk])t[Ss]} -1 {\1\0xC3} # dc or kc
- X 1 {TS([^A-Za-z])} -2 {\0xE3\1} # C
- X 1 {ts([^A-Za-z])} -2 {\0xC3\1} # c
- X
- X# Je --- people frequently write e instead of Je. E oborotnoje is
- X# frequently at the beginning of foreign origin words
- X
- X 1 "AER" 0 "\0xE1\0xFC\0xF2"
- X 1 {([Aa])er} -1 {\1\0xDC\0xD2}
- X
- X 1 {([A-Za-z])'[IiYyJj]?E} -1 {\1\0xF8\0xE5} # Je
- X 1 {([A-Za-z])'[IiYyJj]?e} -1 {\1\0xD8\0xC5} # je
- X
- X# Capital Je
- X 1 {([^A-Za-z])E(([Mm][Uu]?[^A-Za-z])|([Mm][Ll])|([Ll][^EeIiLlYyJj'])\
- X |([Ll][YyIiJj]?[Ee][^A-Za-z])|([Rr][Uu])|([Ss][HhTtLl])|([Kk][Aa]))}
- X
- X -1 {\1\0xE5\2} # Je
- X# Small je
- X 1 {([^A-Za-z])e(([Mm][Uu]?[^A-Za-z])|([Mm][Ll])|([Ll][^EeIiLlYyJj'])\
- X |([Ll][YyIiJj]?[Ee][^A-Za-z])|([Rr][Uu])|([Ss][HhTtLl])|([Kk][Aa]))}
- X -1 {\1\0xC5\2} # je
- X
- X# Capital Eh
- X 1 {([^A-Za-z])E(([Ll][Ee][KkGg])|([KLMNPRSTFklmnprstf]))} -1 {\1\0xFC\2} #Eh
- X
- X# Small eh
- X 1 {([^A-Za-z])e(([Ll][Ee][KkGg])|([KLMNPRSTFklmnprstf]))} -1 {\1\0xDC\2} #eh
- X
- X 1 {([iIOoPpUuFfYy])i[Ee]} -1 {\1\0xC5} # ie->je
- X 1 {([iIOoPpUuFfYy])I[Ee]} -1 {\1\0xE5} # ie->je
- X
- X# Eh is e oborotnoje but not at the end of the word
- X 1 {E[Hh]([^A-Za-z]+)} 0 {\0xE5\0xE8\1}
- X 1 {e[Hh]([^A-Za-z]+)} 0 {\0xC5\0xC8\1}
- X 1 {E[Hh]} 0 "\0xFC" # E oborotnoje
- X 1 "eh" 0 "\0xDC" # e oborotnoje
- X
- X# Various I kratkoe
- X 1 {J[Ii]} 0 "\0xEA" # I kratkoje
- X 1 {J[Jj]} 0 "\0xEA"
- X 1 "ji" 0 "\0xCA" # i kratkoje
- X 1 "jj" 0 "\0xCA"
- X
- X
- X
- X# SHCH
- X 1 {s[Hh][Cc][Hh]} 0 "\0xDD"
- X 1 "w" 0 "\0xDD"
- X 1 {S[Hh][Cc][Hh]} 0 "\0xFD"
- X 1 "W" 0 "\0xFD"
- X
- X
- X 1 {[YJ][Oo]} 0 "\0xB3" # capital Jo
- X 1 {J[Ee]} 0 "\0xE5" # Je
- X# The pattern below captures a single group (the M and the character after
- X# it), so the substitution must re-emit that group as \1.
- X 1 {RIU(M[^A-Za-z])} -1 {\0xF2\0xE9\0xF5\1} # IU
- X 1 {([^A-Za-z])I([Uu][Dd])} -1 {\1\0xE9\2}
- X 1 "DIUS" 0 "\0xE4\0xE9\0xF5\0xF3"
- X 1 {[IYJ][Uu]} 0 "\0xE0" # Ju
- X 1 {([Dd])I([Aa][KkGgPp])} -1 {\1\0xE9\2} # dia
- X 1 "RIAL" 0 "\0xF2\0xE9\0xE1\0xEC" # rial
- X 1 "KIA" 0 "\0xEB\0xE9\0xE1" # kia
- X 1 {[IYJ][Aa]} 0 "\0xF1" # Ja
- X 1 {Z[Hh]} 0 "\0xF6"
- X 1 {K[Hh]} 0 "\0xE8"
- X 1 {H[Hh]} 0 "\0xE8"
- X 1 {C[Hh]} 0 "\0xFE"
- X 1 {S[Hh]} 0 "\0xFB"
- X 1 "zh" 0 "\0xD6"
- X 1 "kh" 0 "\0xC8"
- X 1 "hh" 0 "\0xC8"
- X 1 "ch" 0 "\0xDE"
- X 1 "sh" 0 "\0xDB"
- X 1 {[yj]o} 0 "\0xA3" #jo
- X 1 "je" 0 "\0xC5" #je
- X
- X 1 {([Rr])iu([Mm][^A-Za-z])} -1 {\1\0xC9\0xD5\2} # iu
- X 1 {([^A-Za-z])i(ud)} -1 {\1\0xC9\2}
- X 1 "dius" 0 "\0xC4\0xC9\0xD5\0xD3"
- X 1 {[iyj]u} 0 "\0xC0" #ju
- X 1 {([Dd])ia([kgp])} -1 {\1\0xC9\0xC1\2} # dia
- X 1 "rial" 0 "\0xD2\0xC9\0xC1\0xCC" # rial
- X 1 "kia" 0 "\0xCB\0xC9\0xC1" # kia
- X 1 {[iyj]a} 0 "\0xD1" #ja
- X
- X 1 "A" 0 "\0xE1"
- X 1 "B" 0 "\0xE2"
- X 1 "V" 0 "\0xF7"
- X 1 "G" 0 "\0xE7"
- X 1 "D" 0 "\0xE4"
- X 1 "Z" 0 "\0xFA"
- X 1 "I" 0 "\0xE9"
- X 1 "J" 0 "\0xEA" # I kratkoje
- X 1 "K" 0 "\0xEB"
- X 1 "L" 0 "\0xEC"
- X 1 "M" 0 "\0xED"
- X 1 "N" 0 "\0xEE"
- X 1 "O" 0 "\0xEF"
- X 1 "P" 0 "\0xF0"
- X 1 "R" 0 "\0xF2"
- X 1 "S" 0 "\0xF3"
- X 1 "T" 0 "\0xF4"
- X 1 "U" 0 "\0xF5"
- X 1 "F" 0 "\0xE6"
- X 1 "X" 0 "\0xE8" # Kha
- X 1 "H" 0 "\0xE8" # Kha
- X 1 "C" 0 "\0xE3"
- X 1 "Y" 0 "\0xF9"
- X 1 "E" 0 "\0xE5" #Je
- X 1 "a" 0 "\0xC1"
- X 1 "b" 0 "\0xC2"
- X 1 "v" 0 "\0xD7"
- X 1 "g" 0 "\0xC7"
- X 1 "d" 0 "\0xC4"
- X 1 "z" 0 "\0xDA"
- X 1 "i" 0 "\0xC9"
- X 1 "j" 0 "\0xCA"
- X 1 "k" 0 "\0xCB"
- X 1 "l" 0 "\0xCC"
- X 1 "m" 0 "\0xCD"
- X 1 "n" 0 "\0xCE"
- X 1 "o" 0 "\0xCF"
- X 1 "p" 0 "\0xD0"
- X 1 "r" 0 "\0xD2"
- X 1 "s" 0 "\0xD3"
- X 1 "t" 0 "\0xD4"
- X 1 "u" 0 "\0xD5"
- X 1 "f" 0 "\0xC6"
- X 1 "x" 0 "\0xC8" # kha
- X 1 "h" 0 "\0xC8" # kha
- X 1 "c" 0 "\0xC3"
- X 1 "y" 0 "\0xD9"
- X 1 "e" 0 "\0xC5" # je
- X
- X
- END_OF_FILE
- if test 10735 -ne `wc -c <'pho-koi8.rus'`; then
- echo shar: \"'pho-koi8.rus'\" unpacked with wrong size!
- fi
- # end of 'pho-koi8.rus'
- fi
- if test -f 'reg_exp.h' -a "${1}" != "-c" ; then
- echo shar: Will not clobber existing file \"'reg_exp.h'\"
- else
- echo shar: Extracting \"'reg_exp.h'\" \(749 characters\)
- sed "s/^X//" >'reg_exp.h' <<'END_OF_FILE'
- X/*
- X * Definitions etc. for regexp(3) routines.
- X *
- X * Caveat: this is V8 regexp(3) [actually, a reimplementation thereof],
- X * not the System V one.
- X */
- X#define NSUBEXP 10
- Xtypedef struct {
- X char *startp[NSUBEXP];
- X char *endp[NSUBEXP];
- X char regstart; /* Internal use only. */
- X char reganch; /* Internal use only. */
- X char *regmust; /* Internal use only. */
- X int regmlen; /* Internal use only. */
- X char program[1]; /* Unwarranted chumminess with compiler. */
- X} reg_exp;
- X
- Xextern reg_exp *reg_comp();
- Xextern int reg_exec();
- Xextern int reg_try();
- Xextern void reg_sub();
- Xextern void reg_error();
- X
- X/*
- X * The first byte of the regexp internal "program" is actually this magic
- X * number; the start node begins in the second byte.
- X */
- X#define MAGIC 0234
- END_OF_FILE
- if test 749 -ne `wc -c <'reg_exp.h'`; then
- echo shar: \"'reg_exp.h'\" unpacked with wrong size!
- fi
- # end of 'reg_exp.h'
- fi
- if test -f 'tex-koi8.rus' -a "${1}" != "-c" ; then
- echo shar: Will not clobber existing file \"'tex-koi8.rus'\"
- else
- echo shar: Extracting \"'tex-koi8.rus'\" \(10491 characters\)
- sed "s/^X//" >'tex-koi8.rus' <<'END_OF_FILE'
- X# Jan Labanowski, jkl@osc.edu, Jan. 10, 1992
- X# File tex_koi8.rus
- X
- X# This is a transliteration data file for converting from Latex to KOI8
- X# as used by RELCOM (GOST 19768-74). It assumes that the sister file:
- X# koi8-tex.rus was used to obtain the LaTeX file.
- X# Since LaTeX is a program, frequently complicated, it is probably possible
- X# to convert LaTeX to KOI8 only with TeX. However, if you have a simple
- X# LaTeX document, without math, tables, different font sizes and shapes,
- X# you can easily convert it to KOI8 text, by listing symbols for Russian
- X# letters on the left side and appropriate KOI8 codes on the right.
- X# Since there are many possible assignments in LaTeX, you need to
- X# modify this file to the ones actually used. First of all, some other
- X# symbols may be used to represent Russian letters. I included some more
- X# popular sequences, but there might be more, which I am not aware of.
- X# The TeX transliteration sequences follow AMS cyrillic convention for
- X# WNCYR fonts with cyracc.def file
- X# To be used with translit.c program by Jan Labanowski. For a format of
- X# this file consult translit documentation
- X
- X 1 file version number
- X
- X " " # string delimiters
- X [ ] # list delimiters
- X { } # regular expression delimiters
- X
- X
- X#starting sequence
- X""
- X
- X
- X#ending sequence
- X""
- X
- X
- X 4 # number of input SHIFT sequences
- X# The last two of the input "character sets" are used simply to delete all
- X# characters from LaTeX preamble and the \end{document} closing
- X# For set 2, note that \cyr may be followed by spaces or new lines
- X# and preceded with some LaTeX escape sequences
- X
- X#SO-match #SO-subs #Nest-up #Nest-down #SI-match #SI-subs
- X "" "" "" "" "" "" #Latin letters
- X
- X{{\\cyr[ \0x09-\0x0D]+}
- X "" "{" "}" "}" "" #cyrillic
- X
- X"\documentstyle"
- X "" "" "" {\begin{document\0x7D[\0d09-\15]*} ""
- X
- X"\end{document}" "" "" "" "" ""
- X
- X
- X 0 # number of output SHIFT sequences, only one set of output characters
- X
- X# conversion table
- X# inp_set inp_seq out_set out_seq
- X
- X 3 [\001-\255] 0 "\00" # delete LaTeX preamble
- X 4 [\001-\255] 0 "\00" # delete end{document}
- X
- X 1 [A-Za-z] 0 [A-Za-z] #Latin letters A-Z and a-z
- X
- X# If double backslash is followed by new line, skip the double backslash
- X 0 {\\\\[\0x09-\0x0d]*} 0 "\0x0D\0x0A" #restore new lines
- X
- X 0 "\\040" 0 " " # protected space
- X
- X
- X# Convert some special TeX characters
- X
- X# these do not require going out of {\cyr ....}
- X 0 "$[$" 0 "["
- X 0 "$]$" 0 "]"
- X 0 "$\wedge$" 0 "^"
- X 0 "$\lbrace$" 0 "{"
- X 0 "$\rbrace$" 0 "}"
- X 0 "$\sim$" 0 "~"
- X 0 "$\backslash$" 0 "\"
- X 0 "$\mid$" 0 "|"
- X 0 "$\star$" 0 "*"
- X 0 "$<$" 0 "<"
- X 0 "$>$" 0 ">"
- X 0 "\$" 0 "$"
- X 0 "\%" 0 "%"
- X
- X# These were represented correctly only in Latin charset
- X 1 "\_" 0 "_"
- X 1 "\&" 0 "&"
- X 1 "\#" 0 "#"
- X 1 "\@" 0 "@"
- X
- X
- X# Cyrillic letters
- X
- X# note that TS and ts sequences are checked before the \cydot is removed
- X
- X 2 {\\T[Ss][ \0x09-\0x0d]*} 0 "\0xE3"
- X 2 {T[Ss]} 0 "\0xE3" # Tse
- X
- X 2 {\\t[Ss][ \0x09-\0x0d]*} 0 "\0xC3" # tse written as a control sequence
- X 2 {t[Ss]} 0 "\0xC3" # tse
- X
- X 2 {\\S[Hh][Cc][Hh][ \0x09-\0x0d]*} 0 "\0xFD"
- X 2 {S[Hh][Cc][Hh]} 0 "\0xFD"
- X
- X 2 {\\s[Hh][Cc][Hh][ \0x09-\0x0d]*} 0 "\0xDD"
- X 2 {s[Hh][Cc][Hh]} 0 "\0xDD"
- X
- X 2 {\\Cdprime[ \0x09-\0x0d]*} 0 "\0xFF" # Tverdyj znak
- X 2 {\\T[Zz][ \0x09-\0x0d]*} 0 "\0xFF"
- X
- X 2 {\\Cprime[ \0x09-\0x0d]*} 0 "\0xF8" # Myagkij znak
- X 2 {\\M[Zz][ \0x09-\0x0d]*} 0 "\0xF8"
- X
- X 2 {\\cdprime[ \0x09-\0x0d]*} 0 "\0xDF" # tverdyj znak
- X 2 {\\tz[ \0x09-\0x0d]*} 0 "\0xDF"
- X
- X 2 {\\cprime[ \0x09-\0x0d]*} 0 "\0xD8" # myagkij znak
- X 2 {\\mz[ \0x09-\0x0d]*} 0 "\0xD8"
- X
- X 2 {\\u[ \0x09-\0x0d]*I} 0 "\0xEA" # I kratkoje
- X 2 "\u{I}" 0 "\0xEA"
- X 2 {\\[Uu]I[ \0x09-\0x0d]*} 0 "\0xEA"
- X
- X 2 {\\u[ \0x09-\0x0d]*i} 0 "\0xCA" # i kratkoje
- X 2 {\\ui[ \0x09-\0x0d]*} 0 "\0xCA"
- X 2 "\u{i}" 0 "\0xCA"
- X
- X 2 {\\`[ \0x09-\0x0d]*E} 0 "\0xFC" # E oborotnoye
- X 2 "\`{E}" 0 "\0xFC"
- X
- X 2 {\\`[ \0x09-\0x0d]*e} 0 "\0xDC" # e oborotnoye
- X 2 "\`{e}" 0 "\0xDC"
- X
- X 2 {\\K[Hh][ \0x09-\0x0d]*} 0 "\0xE8"
- X 2 {K[Hh]} 0 "\0xE8"
- X
- X 2 {\\k[Hh][ \0x09-\0x0d]*} 0 "\0xC8"
- X 2 {k[Hh]} 0 "\0xC8"
- X
- X 2 {\\T[Cc][Hh][ \0x09-\0x0d]*} 0 "\0xFE"
- X 2 {\\C[Hh][ \0x09-\0x0d]*} 0 "\0xFE"
- X 2 {C[Hh]} 0 "\0xFE"
- X
- X
- X 2 {\\S[Hh][ \0x09-\0x0d]*} 0 "\0xFB"
- X 2 {S[Hh]} 0 "\0xFB"
- X
- X 2 {\\c[Hh][ \0x09-\0x0d]*} 0 "\0xDE"
- X 2 {\\t[Cc][Hh][ \0x09-\0x0d]*} 0 "\0xDE"
- X 2 {c[Hh]} 0 "\0xDE"
- X
- X 2 {\\s[Hh][ \0x09-\0x0d]*} 0 "\0xDB"
- X 2 {s[Hh]} 0 "\0xDB"
- X
- X 2 {\\Z[Hh][ \0x09-\0x0d]*} 0 "\0xF6"
- X 2 {Z[Hh]} 0 "\0xF6"
- X
- X 2 {\\z[Hh][ \0x09-\0x0d]*} 0 "\0xD6"
- X 2 {z[Hh]} 0 "\0xD6"
- X
- X 2 {\\Y[Uu][ \0x09-\0x0d]*} 0 "\0xE0"
- X 2 {Y[Uu]} 0 "\0xE0"
- X
- X 2 {\\Y[Aa][ \0x09-\0x0d]*} 0 "\0xF1"
- X 2 {Y[Aa]} 0 "\0xF1"
- X
- X 2 {\\y[Uu][ \0x09-\0x0d]*} 0 "\0xC0"
- X 2 {y[Uu]} 0 "\0xC0"
- X
- X 2 {\\y[Aa][ \0x09-\0x0d]*} 0 "\0xD1"
- X 2 {y[Aa]} 0 "\0xD1"
- X
- X 2 {\\"[ \0x09-\0x0D]*e} 0 "\0xA3" # small \"e (yo)
- X 2 "\\0o42{e}" 0 "\0xA3"
- X 2 {\\y[Oo][ \0x09-\0x0D]*} 0 "\0xA3"
- X
- X 2 {\\"[ \0x09-\0x0D]*E} 0 "\0xB3" # capital \"E (Yo)
- X 2 "\\0o42{E}" 0 "\0xB3"
- X 2 {\\Y[Oo][ \0x09-\0x0D]*} 0 "\0xB3"
- X
- X 2 {\\cydot[ \0x09-\0x0d]*} 0 "" #\cydot out
- X
- X 2 "H" 0 "\0xE8"
- X 2 "h" 0 "\0xC8"
- X 2 "W" 0 "\0xFD"
- X 2 "w" 0 "\0xDD"
- X 2 "X" 0 "\0xFB"
- X 2 "x" 0 "\0xDB"
- X
- X 2 "A" 0 "\0xE1"
- X 2 "B" 0 "\0xE2"
- X 2 "V" 0 "\0xF7"
- X 2 "G" 0 "\0xE7"
- X 2 "D" 0 "\0xE4"
- X 2 "E" 0 "\0xE5"
- X 2 "Z" 0 "\0xFA"
- X 2 "I" 0 "\0xE9"
- X 2 "K" 0 "\0xEB"
- X 2 "L" 0 "\0xEC"
- X 2 "M" 0 "\0xED"
- X 2 "N" 0 "\0xEE"
- X 2 "O" 0 "\0xEF"
- X 2 "P" 0 "\0xF0"
- X 2 "R" 0 "\0xF2"
- X 2 "S" 0 "\0xF3"
- X 2 "T" 0 "\0xF4"
- X 2 "U" 0 "\0xF5"
- X 2 "F" 0 "\0xE6"
- X 2 "C" 0 "\0xE3"
- X 2 "Y" 0 "\0xF9"
- X 2 "a" 0 "\0xC1"
- X 2 "b" 0 "\0xC2"
- X 2 "v" 0 "\0xD7"
- X 2 "g" 0 "\0xC7"
- X 2 "d" 0 "\0xC4"
- X 2 "e" 0 "\0xC5"
- X 2 "z" 0 "\0xDA"
- X 2 "i" 0 "\0xC9"
- X 2 "k" 0 "\0xCB"
- X 2 "l" 0 "\0xCC"
- X 2 "m" 0 "\0xCD"
- X 2 "n" 0 "\0xCE"
- X 2 "o" 0 "\0xCF"
- X 2 "p" 0 "\0xD0"
- X 2 "r" 0 "\0xD2"
- X 2 "s" 0 "\0xD3"
- X 2 "t" 0 "\0xD4"
- X 2 "u" 0 "\0xD5"
- X 2 "f" 0 "\0xC6"
- X 2 "c" 0 "\0xC3"
- X 2 "y" 0 "\0xD9"
- X
- X# Trash {}
- X 0 "{" 0 ""
- END_OF_FILE
- if test 10491 -ne `wc -c <'tex-koi8.rus'`; then
- echo shar: \"'tex-koi8.rus'\" unpacked with wrong size!
- fi
- # end of 'tex-koi8.rus'
- fi
- if test -f 'translit.txt.A' -a "${1}" != "-c" ; then
- echo shar: Will not clobber existing file \"'translit.txt.A'\"
- else
- echo shar: Extracting \"'translit.txt.A'\" \(34820 characters\)
- sed "s/^X//" >'translit.txt.A' <<'END_OF_FILE'
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- XNAME
- X TRANSLIT
- X Program to transliterate texts in different character
- X sets. The program converts input character codes (or
- X sequences of codes) to a different set of output char-
- X acter codes (or sequences of codes). Intended for
- X transliteration to/from phonetic representation of
- X foreign letters with Latin letters from/to special
- X national codes used for these letters. It supports
- X simple matches, character lists and flexible matches
- X via regular expressions. The new transliteration
- X schemes are easily added by creating simple transli-
- X teration tables. Multiple character sets are supported
- X for input and output. It does not yet support UNICODE,
- X but some day it will.
- X
- X
- XCOPYRIGHT
- X Copyright (c) 1993 Jan Labanowski and JKL Enterprises, Inc.
- X You may distribute the Software only as a complete set of
- X files. You may distribute the modified Software only if you
- X retain the Copyright notice and you do not delete original
- X code, data, documentation and associated files. The
- X Software is copyrighted. You may not sell the software or
- X incorporate it in the commercial product without written
- X permission from Jan Labanowski or JKL Enterprises, Inc. You
- X are allowed to charge for media and copying if you distri-
- X bute the whole unaltered package.
- X
- X
- XSYNOPSIS
- X translit [ -i inpfile ][ -o outfile ][ -d ][ -t transtbl |
- X transtbl ]
- X
- X
- XOPTIONS
- X -i inpfile
- X inpfile is a name of input file to be transliterated.
- X If "-i" is not specified, the input is taken from stan-
- X dard input.
- X
- X -o outfile
- X outfile is an output file, where the transliterated
- X text is stored. If "-o" is not specified, the output
- X is directed to the standard output. Program will not
- X overwrite the existing file. If file exists, you need
- X to delete it first.
- X
- X -d Some information on character codes read from transli-
- X teration table file are sent to standard error
- X ("stderr"). Useful when developing new transliteration
- X tables.
- X
- X
- X
- XJKL Last change: 23-Jan-1993 1
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X -t transtbl
- X transtbl is a transliteration table file which you want
- X to use. The "-t" option may be omitted if the transtbl
- X is specified as the last parameter on the command line.
- X The program first tries to locate transtbl file in the
- X current directory, and if not found, it searches the
- X directory chosen at compilation/installation time in
- X "paths.h". If no "transtbl" is given, the default file
- X name specified in "paths.h" is taken. The
- X compile/installation time defaults in "paths.h" for the
- X search directory and the default file name can be
- X overridden by setting environment variables: TRANSP and
- X TRANSF, respectively (see below).
- X
- X
- XENVIRONMENT VARIABLES
- X The default path to the directory holding transliteration
- X tables can be overridden by setting environment variable
- X TRANSP. The default name for the transliteration table can
- X be overridden by setting TRANSF environment variable. However,
- X when the transliteration file is given on the command line,
- X it will override the defaults and environment setting. Here
- X are some examples of setting environment variables for dif-
- X ferent operating systems:
- X
- X UN*X System
- X If you are using csh (C-shell):
- X setenv TRANSP /home/john/translit/
- X setenv TRANSF koi8-tex.rus
- X If you are using sh (Bourne Shell):
- X TRANSP=/home/john/translit/
- X export TRANSP
- X TRANSF=koi8-tex.rus
- X export TRANSF
- X VAX-VMS System
- X TRANSP:==SYS$USER:[JOHN.TRANSLIT]
- X TRANSF:==KOI8-TEX.TBL
- X PC-DOS or MS-DOS
- X SET TRANSP=C:\JOHN\TRANSLIT\
- X SET TRANSF=KOI8-TEX.TBL
- X Note that the directory path has to include concluding
- X slashes, \ or /.
- X
- X
- X
- XEXAMPLES
- X cat text.koi8 | translit koi8-tex.rus > text.tex
- X in UN*X is equivalent to:
- X
- X translit -t koi8-tex.rus -o text.tex -i text.koi8
- X and converts file text.koi8 to file text.tex using transli-
- X teration specified in the file koi8-tex.rus.
- X
- X
- X
- XJKL Last change: 23-Jan-1993 2
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X translit -i text.koi8 koi8-lc.rus
- X displays the converted text from file text.koi8 on your ter-
- X minal. The conversion table is koi8-lc.rus (KOI8 --> Library
- X of Congress).
- X
- X translit -i text.alt -t alt-koi8.rus | translit -o
- X text.tex -t koi8-tex.rus
- X is essentially equivalent to the following two commands in
- X UN*X or MS-DOS:
- X translit -i text.alt -o junkfile -t alt-koi8.rus
- X translit -i junkfile -o text.tex -t koi8-tex.rus
- X and converts the file in ALT character set to a LaTeX file
- X for printing.
- X
- X translit -i russ.txt pho-koi8.rus | translit -o
- X russ.tex koi8-tex.rus
- X converts file russ.txt from phonetic transliteration to
- X LaTeX file russ.tex for printing.
- X
- X
- X
- X
- XTRANSLITERATION TABLES
- X The following transliteration files are available with the
- X current distribution. Consult the comments in the individual
- X files for details.
- X
- X koi8-tex.rus
- X Conversion table which changes the file in KOI8 (8 bit
- X character set used by RELCOM news service) to a LaTeX
- X file for printing with AMS WNCYR fonts.
- X
- X tex-koi8.rus
- X Conversion table for the LaTeX to KOI8 conversion. Note
- X that it will not handle complicated cases, since LaTeX
- X is a program, and only TeX can convert a LaTeX source
- X to the characters. However, it should work OK for sim-
- X ple cases of text only files, and may need some editing
- X for complicated cases.
- X
- X alt-gos.rus
- X This is a transliteration data file for converting from
- X ALT (Bryabrins alternativnyj variant used in many popu-
- X lar wordprocessors) to GOSTSCII 84 (approx. ISO-8859-
- X 5?)
- X
- X alt-koi8.rus
- X This is a transliteration data file for converting from
- X ALT to KOI8. KOI8 is meant to be GOST 19768-74 (as
- X used by RELCOM).
- X
- X gos-alt.rus
- X
- X
- X
- XJKL Last change: 23-Jan-1993 3
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X This is a transliteration data file for converting
- X GOSTSCII 84 (approx. ISO-8859-5?) to ALT (Bryabrins
- X alternativnyj variant)
- X
- X gos-koi8.rus
- X This is a transliteration data file for converting
- X GOSTSCII 84 (approx. ISO-8859-5?) to KOI8 used by REL-
- X COM. KOI8 is meant to be GOST 19768-74
- X
- X koi8-alt.rus
- X This is a transliteration data file for converting from
- X KOI8. KOI8 is meant to be GOST 19768-74, to ALT
- X (Bryabrins alternativnyj variant)
- X
- X koi8-gos.rus
- X This is a transliteration data file for converting from
- X KOI8 (Relcom). KOI8 is meant to be GOST 19768-74, to
- X GOSTSCII 84 (approx. ISO-8859-5)
- X
- X koi8-7.rus
- X This file converts from KOI8 to KOI7.
- X
- X koi7-8.rus
- X This file converts from KOI7 to KOI8. Before you
- X attempt the conversion, you might need to perform a
- X simple edit on your file. You MUST read the comments in
- X koi7-8.rus before you attempt this conversion.
- X
- X koi7nl-8.rus
- X This file assumes that there are only Russian letters
- X (no Latin) in the input file. If you have Latin
- X letters, and you inserted SHIFT-OUT/IN characters, use
- X file koi7-8.rus.
- X
- X koi8-lc.rus
- X This file converts KOI8 to the Library of Congress
- X transliteration. Some extensions are added.
- X
- X koi8-php.rus
- X This file converts KOI8 to the Pokrovsky translitera-
- X tion.
- X
- X php-koi8.rus
- X This file converts from Pokrovsky transliteration to
- X KOI8.
- X
- X koi8-phg.rus
- X This file converts from KOI8 to GOST transliteration.
- X
- X phg-koi8.rus
- X This file converts from GOST transliteration to KOI8.
- X
- X
- X
- X
- XJKL Last change: 23-Jan-1993 4
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X pho-koi8.rus
- X This is a table which will convert from many "phonetic"
- X transliteration schemes to KOI8. It is elaborate and it
- X takes a lot of time to transliterate the file using
- X this table. Some transliterations are hopeless and
- X internally inconsistent (as humans...), so the results
- X cannot be bug free. You might want to modify the file,
- X if your transliteration patterns are different than
- X those assumed in this file. You may also want to sim-
- X plify this file if the phonetic transliteration you are
- X converting is a sound one (most are not, e.g., they use
- X e for je and e oborotnoye, ts for c and t-s, h for kha,
- X i for i-kratkoe, etc.).
- X
- X
- X
- XINTRODUCTION
- X If you do not intend to write your own transliteration
- X tables, you may skip this description and go directly to the
- X installation and copyright sections. However, you might want
- X to read this material anyhow, to better understand the traps
- X and complexities of transliteration. It is frequently
- X necessary to transliterate text, i.e., to change one set of
- X characters (or composite characters, phonemes, etc.) to
- X another set.
- X
- X On computers, the transliteration operation consists of con-
- X verting the input file in some character set to the output
- X file in another character set.
- X
- X In the simplest case, the single characters are transli-
- X terated, i.e., their codes are changed according to some
- X transliteration table. This is called remapping and, assum-
- X ing the one-to-one mapping, the task can be accomplished by
- X a simple pseudo program:
- X new_char_code = character_map[old_char_code];
- X
- X If the one-to-one correspondence does not exist (i.e., some
- X codes may be present in one set, but do not have correspond-
- X ing codes in another set), precise transliteration is not
- X possible. In such cases there are 3 obvious possibilities:
- X 1. skip characters which do not have counterparts,
- X 2. retain unchanged codes of these characters,
- X 3. convert the codes to multicharacter sequences.
- X In some cases, the file can contain more than one character
- X set, e.g., the file can contain Latin characters (e.g.
- X English text) and Cyrillic characters (e.g. Russian text).
- X If the character codes assigned to characters in different
- X sets do not overlap, this is still a simple mapping problem.
- X This is the case with KOI8 or GOSTSCII character tables for
- X Russian, which reserve the lower 127 codes for standard
- X ASCII codes (which include all Latin characters) and
- X
- X
- X
- XJKL Last change: 23-Jan-1993 5
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X characters with codes above 127 for Cyrillic letters.
- X
- X If character codes overlap, there is a SHIFT-OUT/SHIFT-IN
- X technique in which the meaning of the character sequence is
- X determined by an opening code (or sequence of characters
- X codes). In this case, the meaning of the series of charac-
- X ters is determined by the SHIFT-OUT character (or sequence)
- X which precedes them. The SHIFT-IN character (or sequence)
- X following the series of characters returns the "reader" to
- X the default or previous status. Two schemes are used:
- X (char_set_1)(SHIFT-IN[1])(SHIFT-OUT[2])(char_set_2)...
- X or
- X (char_set_1)(SHIFT-OUT[2])(char_set_2)(SHIFT-
- X OUT[1])char_set_1...
- X
- X Since computer keyboards, screens, printers, software, etc.,
- X are by necessity language specific (the most popular being
- X ASCII), there is a problem of typing foreign language text
- X which contains letters different than standard Latin alpha-
- X bet. For this reason, many transliteration schemes use
- X several Latin letters to represent a single letter of
- X foreign alphabet, for example:
- X zh is used to represent cyrillic letter zhe, \"o may be
- X used to represent the o umlaut, etc.
- X
- X If there is one-to-one mapping of such sequences to another
- X alphabet, it is also easy to process. However, it is neces-
- X sary to substitute longest sequences first. For example, a
- X frequently used transliteration for cyrillic letters:
- X shch --- letter shcza 221 (decimal KOI8 code)
- X sh --- letter sha 219
- X ch --- letter cze 222
- X c --- letter tse 195
- X h --- letter kha 200
- X a --- letter a 193
- X
- X Obviously, in this case, we should proceed first with con-
- X verting all shch sequences to shcha letter, then two-
- X character sh and ch, and then single character c and h.
- X Generally, for the one-to-one transliteration, the longest
- X sequences should be processed first, and the order of
- X conversion within sequences of the same length makes no
- X difference. For example, converting the word "shchah" to
- X KOI8 should proceed in the following way:
- X shchah --> (221)ah, (221)ah --> (221)(193)h, (221)(193)h
- X --> (221)(193)(200)
- X There is a multitude of reasons why transliteration is done.
- X I wrote this program having in mind the following ones:
- X 1) to print cyrillic text using TeX/LaTeX and cyrillic
- X fonts
- X 2) to read KOI8 encoded messages from Russia on my ASCII
- X terminal.
- X
- X
- X
- XJKL Last change: 23-Jan-1993 6
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X However, I was trying to make it flexible to accommodate
- X other uses.
- X
- X
- XPROGRAM OPERATION
- X The program converts the input file to an output file using
- X transliteration rules from the transliteration rule file
- X which you specify with option -t. Some examples of transli-
- X teration rule files are enclosed. Before program can be
- X used, the transliteration rules need to be specified.
- X
- X These are given as a file which consists of the following
- X parts described below:
- X 1) File format number (it is 1 at this moment)
- X 2) Delimiters used to enclose a) simple strings, b) char-
- X acter lists, c) regular expressions
- X 3) Starting sequence for output
- X 4) Ending sequence for output
- X 5) Number of input "character sets"
- X 6) SHIFT-OUT/SHIFT-IN sequences for each input character
- X set
- X 7) Number of output "character sets"
- X 8) SHIFT-OUT/SHIFT-IN sequences for each output character
- X set
- X 9) Transliteration table
- X
- X GENERAL COMMENTS
- X The transliteration rules file consists of comments and
- X data. The comments may be included in the file as:
- X a) line comments --- lines starting with ! or # character
- X (# or ! must be in the first column of a line) are
- X treated as comments and are not read in by the program.
- X b) comments following all required entries on the line.
- X They must be separated by at least one space from the
- X last data entry on the line and need not start with any
- X particular character. These comments cannot be used
- X within multiline sequences.
- X
- X The data entries consist of integer numbers and strings.
- X The strings may represent:
- X a) plain strings
- X b) character lists
- X c) regular expressions
- X
- X All strings which appear in the file, are processed through
- X the "string processor", which allows entering unprintable
- X characters as codes. The character code is specified as a
- X backslash "\" followed by at least 2 digit(s) (i.e., \01
- X produces code=1, but \1 is passed unchanged). The following
- X formats are supported:
- X \0123 character of octal code 123 (when leading zero
- X present)
- X
- X
- X
- XJKL Last change: 23-Jan-1993 7
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X \123 character of decimal code 123 (when leading digit
- X is not zero)
- X \0o123 or \0O123 character of octal code 123
- X \0d123 or \0D123 character of decimal code 123
- X \0xA3 or \0XA3 or \0xa3 character of hexadecimal code
- X A3
- X
- X The allowed digits are 0-7 for octal codes, 0-9 for decimal
- X codes and 0-F (and/or 0-f) for hexadecimal codes. In a
- X situation when code has to be followed by a digit character,
- X you need to enter the digit as a code. E.g., if you want
- X character \0xA3 followed by a letter C, you need to specify
- X letter C as a code (\0x43 or \0103 or \0o103 or \0d67) and
- X type the sequence as, e.g., \0xA3\0103. Character resulting
- X in a code 0 (zero) (e.g., \00) is special. It tells: "skip
- X everything that follows me in this string". It does not
- X make sense to use it, since you can always terminate the
- X sequence with a delimiter. When you use an empty string as
- X a matching sequence, remember that it does not match any-
- X thing.
- X
- X If the line with entries is too long, you can break it
- X between the fields. If the string is too long to fit a
- X line, you can break it before any nonblank character by the
- X \ (backslash) followed by white space (i.e., new lines,
- X spaces, tabs, etc.). The \ and the following white space
- X will be removed from the string by the string preprocessor.
- X However, you are not allowed to break the individual charac-
- X ter codes (and you probably would not do it ever for
- X aesthetic purposes). For example:
- X "experi\
- X mental design"
- X is equivalent to:
- X "experimental design"
- X while:
- X "experimental\
- X design"
- X is equivalent to:
- X "experimentaldesign"
- X If you need to have \ followed by a space in your string,
- X you need to enter either a backslash or a space following it
- X as an explicit character code, for example:
- X "\\0o40"
- X will produce a \ followed by the space, while the string:
- X "\ "
- X will be empty.
- X
- X The preprocessor knows only about comments, plain charac-
- X ters, character codes, and continuation lines. However, some
- X characters and their combinations may have a special meaning
- X in lists and regular expressions.
- X
- X
- X
- X
- XJKL Last change: 23-Jan-1993 8
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X DETAILS OF FILE STRUCTURE
- X
- X
- X Ad.1) File format number. This is simply a digit 1 on a line
- X by itself at the moment. This entry is included to allow
- X future extensions of the transliteration description file
- X without the need to modify older transliteration descrip-
- X tions (program will read data according to the current
- X file format number given in the file).
- X
- X Ad.2) String delimiters. The subsequent 3 lines specify
- X pairs of single character delimiters for 3 types of text
- X data. The line format is:
- X opening_character closing_character.
- X These are needed to mark the beginning/end and the type
- X of the text data. Each string (text datum) is saved
- X starting from the first character after opening delim-
- X iter, and ends at the last character before the closing
- X delimiter. If you need to use the closing delimiter
- X within a string, you need to specify it as its code
- X (e.g., if you are using () pair as delimiters, specify
- X ")" as \0x29). The opening delimiter may be the same or
- X different from the closing delimiter.
- X
- X a) The first line contains characters used to enclose
- X (bracket) a plain string. Plain strings are directly
- X matched to input data or directly sent to output. I
- X suggest to stick to " " pair for plain strings. The
- X ASCII code for " is \0d34 = \0x22 = \0o42 if you need
- X it inside the string itself.
- X
- X b) The second line contains characters to mark the begin-
- X ning and the end of the list. Lists are used to
- X translate single character codes. I suggest [ and ]
- X delimiters for the list (ASCII code of "]" is: \0d93 =
- X \0x5D = \0o135). The lists may include ranges, for
- X example: [a-zA-Z0-9] will include all Latin letters
- X (small and capital) and digits. Note that order is
- X important: [a-d] is equivalent to [abcd], while [d-a]
- X will result in an error. If you want to include "-"
- X (minus) in the list, you need to place it as the first
- X or the last character. There are only two special char-
- X acters on the list, the "-" described above, and the
- X "]" character. You need to enter the "]" as its code.
- X E.g., for ASCII character table [*--] is equivalent to
- X [*+,-], is equivalent to [\42\43\44\45]. The order of
- X characters in the list does not matter unless the input
- X list corresponds to the output list (this will be
- X explained later). Empty lists do not make sense.
- X
- X c) The third line of delimiter specification contains
- X delimiters for regular expressions and substitution
- X
- X
- X
- XJKL Last change: 23-Jan-1993 9
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X expressions. These strings are used for "flexible"
- X matches to the text in the input file. They are very
- X similar to the ones used in UN*X for searching text in
- X utilities like: grep, sed, vi, awk, etc., though only
- X a subset of full UN*X regular expression syntax is used
- X here. I suggest enclosing them within braces { and }
- X (ASCII code for } is \0d125 = \0x7D = \0o175). Actu-
- X ally, regular expressions can only be used for input
- X sequences, and for output sequences the {} are used to
- X enclose substitution sequences. This will be explained
- X below. The description of the syntax for
- X regular/substitution expressions is adapted from the
- X documentation for the regexp package of Henry Spencer,
- X University of Toronto --- this regular expression pack-
- X age was incorporated, after minute modifications, into
- X the program.
- X
- X
- X REGULAR EXPRESSION SYNTAX
- X A regular expression is zero or more branches,
- X separated by `|'. It matches anything that matches
- X one of the branches. The `|' simply means "or".
- X A branch is zero or more pieces, concatenated. It
- X matches a match for the first, followed by a match
- X for the second, etc.
- X A piece is an atom possibly followed by `*', `+',
- X or `?'. An atom followed by `*' matches a
- X sequence of 0 or more matches of the atom. An atom
- X followed by `+' matches a sequence of 1 or more
- X matches of the atom. An atom followed by `?' matches
- X zero or one occurrences of atom.
- X An atom is a regular expression in parentheses
- X (matching a match for the regular expression), a
- X range (see below), `.' (matching any single charac-
- X ter), a `\' followed by a single character (matching
- X that character), or a single character with no other
- X significance (matching that character).
- X A range is a sequence of characters enclosed in
- X `[]'. It normally matches any single character from
- X the sequence. If the sequence begins with `^', it
- X matches any single character not from the rest of the
- X sequence. If two characters in the sequence are
- X separated by `-', this is shorthand for the full list
- X of ASCII characters between them (e.g. `[0-9]'
- X matches any decimal digit). To include a literal `]'
- X in the sequence, make it the first character (follow-
- X ing a possible `^'). To include a literal `-', make it
- X the first or last character. The regular expression
- X can contain subexpressions which are enclosed in a ()
- X pair. These subexpressions are numbered 1 to 9 and can
- X be nested. The numbering of subexpressions is given in
- X the order of their opening parentheses "(". For
- X
- X
- X
- XJKL Last change: 23-Jan-1993 10
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X example:
- X (111)...(22(333)222(444)222)...(555)
- X Note that expression 2 contains within itself expres-
- X sions 3 and 4.
- X These subexpressions can be referenced in the substitu-
- X tion string, which is described in the paragraph
- X below, or can be used to delimit atoms.
- X Examples:
- X {[\0d32\0d09]\0d10} --- will match space or tab fol-
- X lowed by new line
- X {[Tt][Ss]} --- will match TS, Ts, tS and ts
- X {TS|Ts|tS|ts} --- same as above
- X {[\0d09-\0d15 ][^hH][^uU][a-zA-Z]*[\0d09-\0d15 ]} ---
- X all words which do not start with hu, Hu, hU, HU.
- X There is a space between \0d15 and ].
- X Note that specifying expressions like {.*} (i.e.,
- X match all characters) does not make much sense,
- X since it would mean here: match the whole input
- X file. However, expressions like {A.*B} should be
- X acceptable, since they match a pair of A and B, and
- X everything in between them, e.g. for a string like:
- X "This is Mr. Allen and this is Mr. Brown." this
- X expression should match the string: "Allen and this
- X is Mr. B".
- X Remember to put a backslash "\" in front of the follow-
- X ing characters: .[()|?+*\ if you want their literal
- X meaning outside the range enclosed in []. Inside the
- X range they have their literal meaning. If you know the
- X syntax of UN*X regular expressions, please note that ^
- X and $ anchors are not supported and are treated as nor-
- X mal characters (with the exception of ^ negation within
- X []).
- X
- X SUBSTITUTION EXPRESSIONS
- X After finding a match for a regular expression in the
- X input text, a substitution is made. It can be a simple
- X substitution where the whole matching string is
- X replaced by another string, or it may reuse a portion
- X or the whole matching string. The subexpressions (the
- X ones enclosed in parentheses) within the regular
- X expression which matched the input text can be refer-
- X enced in the substitution expression. Only the follow-
- X ing characters have special meaning within substitution
- X expression:
- X & --- will put the whole matching string.
- X \1 --- will put the match for the 1st subexpression
- X in ().
- X \2 --- will put the string which matched 2nd subex-
- X pression, etc.
- X \9 --- will place in a replacement string the 9th
- X subexpression (provided that there were 9 () pairs
- X in the regular expression)
- X
- X
- X
- XJKL Last change: 23-Jan-1993 11
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X Only 9 subexpressions are allowed. All other charac-
- X ters and sequences within the substitution expression
- X will be placed in a substitution string as written. To
- X be able to put a single backslash there, you need to
- X put two of them. To be able to place the unchanged
- X codes of the above characters (i.e., to make them
- X literals), you need to precede them with a backslash
- X "\", i.e., to get & in the output string you need to
- X write it as \&. Similarly, to place literal \1, \2,
- X etc., you need to enter it as \\1, \\2, etc. Note that
- X characters .+[]()^, etc. which had a special meaning in
- X the regular expressions, do not have any special mean-
- X ing in the substitution expression and will be output
- X as written.
- X Example:
- X The regular expression:
- X {([Tt])([Ss])} and the corresponding substitution
- X expression {\1.\2} puts a period between adjoining
- X letters t and s preserving their letter case.
- X The expression:
- X {([A-Za-z]+)-[ \0x09]*([\0x0A-\0x0D]+)[ \0x09]*([A-
- X Za-z,.?;:"\)'`!]+)[ \0x09]}
- X and the substitution expression {\1\3\2} dehyphen-
- X ate words (when you understand this one, you are a
- X guru...). For example: con- (NL)cert is changed
- X to concert(NL), where NL stands for New Line. It
- X looks for one or more letters (saves them as sub-
- X string 1) followed by a hyphen (which may be fol-
- X lowed by zero or more spaces or tabs). The hyphen
- X must be followed by a NewLine (ASCII characters
- X 0A-0D hex form various new line sequences) and
- X saves NewLine sequence as a subexpression 2. Then
- X it looks for zero or more tabs and spaces (at the
- X beginning of the line). Then it looks for the rest
- X of the hyphenated word and saves it as substring 3.
- X The word may have punctuation attached. Then it
- X looks again for some spaces or tabs. The substitu-
- X tion expression junks all sequences which were not
- X within (), i.e., hyphen and spaces/tabs and inserts
- X only substrings but in a different order. The \1
- X (word beginning) is followed by \3 (word end) and
- X followed by the NewLine --- \2. The {\2\1\3} would
- X be probably equally good, though you would need to
- X move the punctuation matching to the beginning of
- X the regular expression.
- X Ad.3) Starting sequence. This sequence will be sent to the
- X output before any text. It is enclosed in the pair of
- X string delimiters. I use it to output LaTeX preamble.
- X However, it can be empty, if not used. The (sequence)
- X may contain any characters, including new lines, etc.
- X Example:
- X "" # empty sequence
- X
- X
- X
- XJKL Last change: 23-Jan-1993 12
- X
- X
- X
- X
- X
- X
- XTRANSLIT(JKL) Version 1.0 TRANSLIT(JKL)
- X
- X
- X
- X Example:
- X "\documentstyle{article}
- X \input cyracc
- X \begin{document}
- X "
- X is right (note a new line at the end), but
- X "\documentstyle{article}
- X \input cyracc # this comment will be included!
- X \begin{document}" # while this will not
- X is wrong.
- X
- X Ad.4) Ending sequence. Similar to 3), but will be appended
- X at the end of the output file.
- X For example:
- X "\end{document}
- X "
- X
- X Ad.5) Number of input character sets. For example, in some
- X incarnation of KOI7, there are two character sets: Latin
- X and Cyrillic. Cyrillic character sequence follows SHIFT-
- X OUT character (CTRL-N), \0x0e, and is terminated by
- X SHIFT-IN character (CTRL-O), \0x0f. Another way of look-
- X ing at it is that Latin characters follow CTRL-O and
- X Cyrillic ones follow CTRL-N.
- X
- X If there is only one character set on input you should
- X specify 0 as a number of input char sets, since the input
- X file obviously does not contain any SHIFT-OUT/IN
- X sequences.
- X
- X Ad.6) SHIFT-OUT/SHIFT-IN sequences for each input character
- X set. These lines appear only if you specified nonzero
- X number of character sets. These lines contain also "nest-
- X ing sequences", which will be explained later in this
- X section. You do not use "nesting sequences" frequently,
- X and let us assume for a moment that nesting data are
- X empty strings. The strings or regular expressions speci-
- X fied here are matched with the contents of input text. If
- X match was found, the matching sequence is usually deleted
- X from the input text and:
- X a) for SHIFT-OUT sequence: the current input character
- X set number is changed to the new one corresponding to
- X the SHIFT-OUT sequence, or
- X b) for SHIFT-IN sequence: the previous input character
- X set number is restored, (i.e., the one which preceded
- X the SHIFT-OUT sequence for the current set). Note
- X that only the SHIFT-IN sequence for the current set
- X is matched. The SHIFT-IN sequences for other charac-
- X ter sets than the current set are not matched. The
- X bracketing of sets is assumed perfect. If the SHIFT-
- X IN sequence for the current set is an empty string,
- X the input set number is changed when SHIFT-OUT
- X
- END_OF_FILE
- if test 34820 -ne `wc -c <'translit.txt.A'`; then # integrity check: the extracted half must be exactly 34820 bytes
- echo shar: \"'translit.txt.A'\" unpacked with wrong size! # report only; the file is left in place
- elif test -f 'translit.txt.B'; then # size is right and the second half already exists (presumably unpacked from another part of the archive)
- echo shar: Combining \"'translit.txt'\" \(68330 characters\)
- cat 'translit.txt.A' 'translit.txt.B' > 'translit.txt' # join the halves, A first, into the complete document
- if test 68330 -ne `wc -c <'translit.txt'`; then # the combined file must be exactly 68330 bytes
- echo shar: \"'translit.txt'\" combined with wrong size! # both halves are kept so the problem can be inspected
- else
- rm translit.txt.A translit.txt.B # halves are removed only after the combined size has been verified
- fi
- fi
- # end of 'translit.txt.A'
- fi
- echo shar: End of archive 6 \(of 10\). # trailer: reaching this line means part 6 was read to its end
- cp /dev/null ark6isdone # leave an empty marker file recording that part 6 has been unpacked
- MISSING="" # accumulates the numbers of the parts whose marker file is absent
- for I in 1 2 3 4 5 6 7 8 9 10 ; do # one iteration per archive part
- if test ! -f ark${I}isdone ; then # no marker file for part I in the current directory
- MISSING="${MISSING} ${I}" # append the part number, space-separated
- fi
- done
- if test "${MISSING}" = "" ; then # every marker file ark1isdone .. ark10isdone was found
- echo You have unpacked all 10 archives.
- rm -f ark[1-9]isdone ark[1-9][0-9]isdone # markers are no longer needed; the globs cover one- and two-digit part numbers
- else
- echo You still must unpack the following archives:
- echo " " ${MISSING} # MISSING is left unquoted, so the part numbers are passed as separate words
- fi
- exit 0 # stop here so that anything following the archive in the same file is not executed
- exit 0 # Just in case...
-