Файловый менеджер - Редактировать - /home/lakoyani/lakoyani.com.fj/I18N.tar
Назад
LangTags/Detect.pm 0000644 00000014620 14711200716 0010014 0 ustar 00 # Time-stamp: "2004-06-20 21:47:55 ADT" require 5; package I18N::LangTags::Detect; use strict; use vars qw( @ISA $VERSION $MATCH_SUPERS $USING_LANGUAGE_TAGS $USE_LITERALS $MATCH_SUPERS_TIGHTLY); BEGIN { unless(defined &DEBUG) { *DEBUG = sub () {0} } } # define the constant 'DEBUG' at compile-time $VERSION = "1.05"; @ISA = (); use I18N::LangTags qw(alternate_language_tags locale2language_tag); sub _uniq { my %seen; return grep(!($seen{$_}++), @_); } sub _normalize { my(@languages) = map lc($_), grep $_, map {; $_, alternate_language_tags($_) } @_; return _uniq(@languages) if wantarray; return $languages[0]; } #--------------------------------------------------------------------------- # The extent of our functional interface: sub detect () { return __PACKAGE__->ambient_langprefs; } #=========================================================================== sub ambient_langprefs { # always returns things untainted my $base_class = $_[0]; return $base_class->http_accept_langs if length( $ENV{'REQUEST_METHOD'} || '' ); # I'm a CGI # it's off in its own routine because it's complicated # Not running as a CGI: try to puzzle out from the environment my @languages; foreach my $envname (qw( LANGUAGE LC_ALL LC_MESSAGES LANG )) { next unless $ENV{$envname}; DEBUG and print "Noting \$$envname: $ENV{$envname}\n"; push @languages, map locale2language_tag($_), # if it's a lg tag, fine, pass thru (untainted) # if it's a locale ID, try converting to a lg tag (untainted), # otherwise nix it. split m/[,:]/, $ENV{$envname} ; last; # first one wins } if($ENV{'IGNORE_WIN32_LOCALE'}) { # no-op } elsif(&_try_use('Win32::Locale')) { # If we have that module installed... push @languages, Win32::Locale::get_language() || '' if defined &Win32::Locale::get_language; } return _normalize @languages; } #--------------------------------------------------------------------------- sub http_accept_langs { # Deal with HTTP "Accept-Language:" stuff. Hassle. # This code is more lenient than RFC 3282, which you must read. # Hm. Should I just move this into I18N::LangTags at some point? no integer; my $in = (@_ > 1) ? $_[1] : $ENV{'HTTP_ACCEPT_LANGUAGE'}; # (always ends up untainting) return() unless defined $in and length $in; $in =~ s/\([^\)]*\)//g; # nix just about any comment if( $in =~ m/^\s*([a-zA-Z][-a-zA-Z]+)\s*$/s ) { # Very common case: just one language tag return _normalize $1; } elsif( $in =~ m/^\s*[a-zA-Z][-a-zA-Z]+(?:\s*,\s*[a-zA-Z][-a-zA-Z]+)*\s*$/s ) { # Common case these days: just "foo, bar, baz" return _normalize( $in =~ m/([a-zA-Z][-a-zA-Z]+)/g ); } # Else it's complicated... $in =~ s/\s+//g; # Yes, we can just do without the WS! my @in = $in =~ m/([^,]+)/g; my %pref; my $q; foreach my $tag (@in) { next unless $tag =~ m/^([a-zA-Z][-a-zA-Z]+) (?: ;q= ( \d* # a bit too broad of a RE, but so what. (?: \.\d+ )? ) )? $ /sx ; $q = (defined $2 and length $2) ? $2 : 1; #print "$1 with q=$q\n"; push @{ $pref{$q} }, lc $1; } return _normalize( # Read off %pref, in descending key order... map @{$pref{$_}}, sort {$b <=> $a} keys %pref ); } #=========================================================================== my %tried = (); # memoization of whether we've used this module, or found it unusable. sub _try_use { # Basically a wrapper around "require Modulename" # "Many men have tried..." "They tried and failed?" "They tried and died." return $tried{$_[0]} if exists $tried{$_[0]}; # memoization my $module = $_[0]; # ASSUME sane module name! { no strict 'refs'; no warnings 'once'; return($tried{$module} = 1) if %{$module . "::Lexicon"} or @{$module . "::ISA"}; # weird case: we never use'd it, but there it is! } print " About to use $module ...\n" if DEBUG; { local $SIG{'__DIE__'}; eval "require $module"; # used to be "use $module", but no point in that. } if($@) { print "Error using $module \: $@\n" if DEBUG > 1; return $tried{$module} = 0; } else { print " OK, $module is used\n" if DEBUG; return $tried{$module} = 1; } } #--------------------------------------------------------------------------- 1; __END__ =head1 NAME I18N::LangTags::Detect - detect the user's language preferences =head1 SYNOPSIS use I18N::LangTags::Detect; my @user_wants = I18N::LangTags::Detect::detect(); =head1 DESCRIPTION It is a common problem to want to detect what language(s) the user would prefer output in. =head1 FUNCTIONS This module defines one public function, C<I18N::LangTags::Detect::detect()>. This function is not exported (nor is even exportable), and it takes no parameters. In scalar context, the function returns the most preferred language tag (or undef if no preference was seen). In list context (which is usually what you want), the function returns a (possibly empty) list of language tags representing (best first) what languages the user apparently would accept output in. You will probably want to pass the output of this through C<I18N::LangTags::implicate_supers_tightly(...)> or C<I18N::LangTags::implicate_supers(...)>, like so: my @languages = I18N::LangTags::implicate_supers_tightly( I18N::LangTags::Detect::detect() ); =head1 ENVIRONMENT This module looks for several environment variables, including REQUEST_METHOD, HTTP_ACCEPT_LANGUAGE, LANGUAGE, LC_ALL, LC_MESSAGES, and LANG. It will also use the L<Win32::Locale> module, if it's installed. =head1 SEE ALSO L<I18N::LangTags>, L<Win32::Locale>, L<Locale::Maketext>. (This module's core code started out as a routine in Locale::Maketext; but I moved it here once I realized it was more generally useful.) =head1 COPYRIGHT Copyright (c) 1998-2004 Sean M. Burke. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The programs and documentation in this dist are distributed in the hope that they will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. =head1 AUTHOR Sean M. Burke C<sburke@cpan.org> =cut # a tip: Put a bit of chopped up pickled ginger in your salad. It's tasty! LangTags/List.pm 0000644 00000070235 14711200716 0007523 0 ustar 00 require 5; package I18N::LangTags::List; # Time-stamp: "2004-10-06 23:26:21 ADT" use strict; use vars qw(%Name %Is_Disrec $Debug $VERSION); $VERSION = '0.35_01'; # POD at the end. #---------------------------------------------------------------------- { # read the table out of our own POD! my $seeking = 1; my $count = 0; my($disrec,$tag,$name); my $last_name = ''; while(<I18N::LangTags::List::DATA>) { if($seeking) { $seeking = 0 if m/=for woohah/; } elsif( ($disrec, $tag, $name) = m/(\[?)\{([-0-9a-zA-Z]+)\}(?:\s*:)?\s*([^\[\]]+)/ ) { $name =~ s/\s*[;\.]*\s*$//g; next unless $name; ++$count; print "<$tag> <$name>\n" if $Debug; $last_name = $Name{$tag} = $name; $Is_Disrec{$tag} = 1 if $disrec; } elsif (m/[Ff]ormerly \"([-a-z0-9]+)\"/) { $Name{$1} = "$last_name (old tag)" if $last_name; $Is_Disrec{$1} = 1; } } die "No tags read??" unless $count; } #---------------------------------------------------------------------- sub name { my $tag = lc($_[0] || return); $tag =~ s/^\s+//s; $tag =~ s/\s+$//s; my $alt; if($tag =~ m/^x-(.+)/) { $alt = "i-$1"; } elsif($tag =~ m/^i-(.+)/) { $alt = "x-$1"; } else { $alt = ''; } my $subform = ''; my $name = ''; print "Input: {$tag}\n" if $Debug; while(length $tag) { last if $name = $Name{$tag}; last if $name = $Name{$alt}; if($tag =~ s/(-[a-z0-9]+)$//s) { print "Shaving off: $1 leaving $tag\n" if $Debug; $subform = "$1$subform"; # and loop around again $alt =~ s/(-[a-z0-9]+)$//s && $Debug && print " alt -> $alt\n"; } else { # we're trying to pull a subform off a primary tag. TILT! print "Aborting on: {$name}{$subform}\n" if $Debug; last; } } print "Output: {$name}{$subform}\n" if $Debug; return unless $name; # Failure return $name unless $subform; # Exact match $subform =~ s/^-//s; $subform =~ s/-$//s; return "$name (Subform \"$subform\")"; } #-------------------------------------------------------------------------- sub is_decent { my $tag = lc($_[0] || return 0); #require I18N::LangTags; return 0 unless $tag =~ /^(?: # First subtag [xi] | [a-z]{2,3} ) (?: # Subtags thereafter - # separator [a-z0-9]{1,8} # subtag )* $/xs; my @supers = (); foreach my $bit (split('-', $tag)) { push @supers, scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; } return 0 unless @supers; shift @supers if $supers[0] =~ m<^(i|x|sgn)$>s; return 0 unless @supers; foreach my $f ($tag, @supers) { return 0 if $Is_Disrec{$f}; return 2 if $Name{$f}; # so that decent subforms of indecent tags are decent } return 2 if $Name{$tag}; # not only is it decent, it's known! return 1; } #-------------------------------------------------------------------------- 1; __DATA__ =head1 NAME I18N::LangTags::List -- tags and names for human languages =head1 SYNOPSIS use I18N::LangTags::List; print "Parlez-vous... ", join(', ', I18N::LangTags::List::name('elx') || 'unknown_language', I18N::LangTags::List::name('ar-Kw') || 'unknown_language', I18N::LangTags::List::name('en') || 'unknown_language', I18N::LangTags::List::name('en-CA') || 'unknown_language', ), "?\n"; prints: Parlez-vous... Elamite, Kuwait Arabic, English, Canadian English? =head1 DESCRIPTION This module provides a function C<I18N::LangTags::List::name( I<langtag> ) > that takes a language tag (see L<I18N::LangTags|I18N::LangTags>) and returns the best attempt at an English name for it, or undef if it can't make sense of the tag. The function I18N::LangTags::List::name(...) is not exported. This module also provides a function C<I18N::LangTags::List::is_decent( I<langtag> )> that returns true iff the language tag is syntactically valid and is for general use (like "fr" or "fr-ca", below). That is, it returns false for tags that are syntactically invalid and for tags, like "aus", that are listed in brackets below. This function is not exported. The map of tags-to-names that it uses is accessible as %I18N::LangTags::List::Name, and it's the same as the list that follows in this documentation, which should be useful to you even if you don't use this module. =head1 ABOUT LANGUAGE TAGS Internet language tags, as defined in RFC 3066, are a formalism for denoting human languages. The two-letter ISO 639-1 language codes are well known (as "en" for English), as are their forms when qualified by a country code ("en-US"). Less well-known are the arbitrary-length non-ISO codes (like "i-mingo"), and the recently (in 2001) introduced three-letter ISO-639-2 codes. Remember these important facts: =over =item * Language tags are not locale IDs. A locale ID is written with a "_" instead of a "-", (almost?) always matches C<m/^\w\w_\w\w\b/>, and I<means> something different than a language tag. A language tag denotes a language. A locale ID denotes a language I<as used in> a particular place, in combination with non-linguistic location-specific information such as what currency is used there. Locales I<also> often denote character set information, as in "en_US.ISO8859-1". =item * Language tags are not for computer languages. =item * "Dialect" is not a useful term, since there is no objective criterion for establishing when two language-forms are dialects of eachother, or are separate languages. =item * Language tags are not case-sensitive. en-US, en-us, En-Us, etc., are all the same tag, and denote the same language. =item * Not every language tag really refers to a single language. Some language tags refer to conditions: i-default (system-message text in English plus maybe other languages), und (undetermined language). Others (notably lots of the three-letter codes) are bibliographic tags that classify whole groups of languages, as with cus "Cushitic (Other)" (i.e., a language that has been classed as Cushtic, but which has no more specific code) or the even less linguistically coherent sai for "South American Indian (Other)". Though useful in bibliography, B<SUCH TAGS ARE NOT FOR GENERAL USE>. For further guidance, email me. =item * Language tags are not country codes. In fact, they are often distinct codes, as with language tag ja for Japanese, and ISO 3166 country code C<.jp> for Japan. =back =head1 LIST OF LANGUAGES The first part of each item is the language tag, between {...}. It is followed by an English name for the language or language-group. Language tags that I judge to be not for general use, are bracketed. This list is in alphabetical order by English name of the language. =for reminder The name in the =item line MUST NOT have E<...>'s in it!! =for woohah START =over =item {ab} : Abkhazian eq Abkhaz =item {ace} : Achinese =item {ach} : Acoli =item {ada} : Adangme =item {ady} : Adyghe eq Adygei =item {aa} : Afar =item {afh} : Afrihili (Artificial) =item {af} : Afrikaans =item [{afa} : Afro-Asiatic (Other)] =item {ak} : Akan (Formerly "aka".) =item {akk} : Akkadian (Historical) =item {sq} : Albanian =item {ale} : Aleut =item [{alg} : Algonquian languages] NOT Algonquin! =item [{tut} : Altaic (Other)] =item {am} : Amharic NOT Aramaic! =item {i-ami} : Ami eq Amis. eq 'Amis. eq Pangca. =item [{apa} : Apache languages] =item {ar} : Arabic Many forms are mutually un-intelligible in spoken media. Notable forms: {ar-ae} UAE Arabic; {ar-bh} Bahrain Arabic; {ar-dz} Algerian Arabic; {ar-eg} Egyptian Arabic; {ar-iq} Iraqi Arabic; {ar-jo} Jordanian Arabic; {ar-kw} Kuwait Arabic; {ar-lb} Lebanese Arabic; {ar-ly} Libyan Arabic; {ar-ma} Moroccan Arabic; {ar-om} Omani Arabic; {ar-qa} Qatari Arabic; {ar-sa} Sauda Arabic; {ar-sy} Syrian Arabic; {ar-tn} Tunisian Arabic; {ar-ye} Yemen Arabic. =item {arc} : Aramaic NOT Amharic! NOT Samaritan Aramaic! =item {arp} : Arapaho =item {arn} : Araucanian =item {arw} : Arawak =item {hy} : Armenian =item {an} : Aragonese =item [{art} : Artificial (Other)] =item {ast} : Asturian eq Bable. =item {as} : Assamese =item [{ath} : Athapascan languages] eq Athabaskan. eq Athapaskan. eq Athabascan. =item [{aus} : Australian languages] =item [{map} : Austronesian (Other)] =item {av} : Avaric (Formerly "ava".) =item {ae} : Avestan eq Zend =item {awa} : Awadhi =item {ay} : Aymara =item {az} : Azerbaijani eq Azeri Notable forms: {az-Arab} Azerbaijani in Arabic script; {az-Cyrl} Azerbaijani in Cyrillic script; {az-Latn} Azerbaijani in Latin script. =item {ban} : Balinese =item [{bat} : Baltic (Other)] =item {bal} : Baluchi =item {bm} : Bambara (Formerly "bam".) =item [{bai} : Bamileke languages] =item {bad} : Banda =item [{bnt} : Bantu (Other)] =item {bas} : Basa =item {ba} : Bashkir =item {eu} : Basque =item {btk} : Batak (Indonesia) =item {bej} : Beja =item {be} : Belarusian eq Belarussian. eq Byelarussian. eq Belorussian. eq Byelorussian. eq White Russian. eq White Ruthenian. NOT Ruthenian! =item {bem} : Bemba =item {bn} : Bengali eq Bangla. =item [{ber} : Berber (Other)] =item {bho} : Bhojpuri =item {bh} : Bihari =item {bik} : Bikol =item {bin} : Bini =item {bi} : Bislama eq Bichelamar. =item {bs} : Bosnian =item {bra} : Braj =item {br} : Breton =item {bug} : Buginese =item {bg} : Bulgarian =item {i-bnn} : Bunun =item {bua} : Buriat =item {my} : Burmese =item {cad} : Caddo =item {car} : Carib =item {ca} : Catalan eq CatalE<aacute>n. eq Catalonian. =item [{cau} : Caucasian (Other)] =item {ceb} : Cebuano =item [{cel} : Celtic (Other)] Notable forms: {cel-gaulish} Gaulish (Historical) =item [{cai} : Central American Indian (Other)] =item {chg} : Chagatai (Historical?) =item [{cmc} : Chamic languages] =item {ch} : Chamorro =item {ce} : Chechen =item {chr} : Cherokee eq Tsalagi =item {chy} : Cheyenne =item {chb} : Chibcha (Historical) NOT Chibchan (which is a language family). =item {ny} : Chichewa eq Nyanja. eq Chinyanja. =item {zh} : Chinese Many forms are mutually un-intelligible in spoken media. Notable forms: {zh-Hans} Chinese, in simplified script; {zh-Hant} Chinese, in traditional script; {zh-tw} Taiwan Chinese; {zh-cn} PRC Chinese; {zh-sg} Singapore Chinese; {zh-mo} Macau Chinese; {zh-hk} Hong Kong Chinese; {zh-guoyu} Mandarin [Putonghua/Guoyu]; {zh-hakka} Hakka [formerly "i-hakka"]; {zh-min} Hokkien; {zh-min-nan} Southern Hokkien; {zh-wuu} Shanghaiese; {zh-xiang} Hunanese; {zh-gan} Gan; {zh-yue} Cantonese. =for etc {i-hakka} Hakka (old tag) =item {chn} : Chinook Jargon eq Chinook Wawa. =item {chp} : Chipewyan =item {cho} : Choctaw =item {cu} : Church Slavic eq Old Church Slavonic. =item {chk} : Chuukese eq Trukese. eq Chuuk. eq Truk. eq Ruk. =item {cv} : Chuvash =item {cop} : Coptic =item {kw} : Cornish =item {co} : Corsican eq Corse. =item {cr} : Cree NOT Creek! (Formerly "cre".) =item {mus} : Creek NOT Cree! =item [{cpe} : English-based Creoles and pidgins (Other)] =item [{cpf} : French-based Creoles and pidgins (Other)] =item [{cpp} : Portuguese-based Creoles and pidgins (Other)] =item [{crp} : Creoles and pidgins (Other)] =item {hr} : Croatian eq Croat. =item [{cus} : Cushitic (Other)] =item {cs} : Czech =item {dak} : Dakota eq Nakota. eq Latoka. =item {da} : Danish =item {dar} : Dargwa =item {day} : Dayak =item {i-default} : Default (Fallthru) Language Defined in RFC 2277, this is for tagging text (which must include English text, and might/should include text in other appropriate languages) that is emitted in a context where language-negotiation wasn't possible -- in SMTP mail failure messages, for example. =item {del} : Delaware =item {din} : Dinka =item {dv} : Divehi eq Maldivian. (Formerly "div".) =item {doi} : Dogri NOT Dogrib! =item {dgr} : Dogrib NOT Dogri! =item [{dra} : Dravidian (Other)] =item {dua} : Duala =item {nl} : Dutch eq Netherlander. Notable forms: {nl-nl} Netherlands Dutch; {nl-be} Belgian Dutch. =item {dum} : Middle Dutch (ca.1050-1350) (Historical) =item {dyu} : Dyula =item {dz} : Dzongkha =item {efi} : Efik =item {egy} : Ancient Egyptian (Historical) =item {eka} : Ekajuk =item {elx} : Elamite (Historical) =item {en} : English Notable forms: {en-au} Australian English; {en-bz} Belize English; {en-ca} Canadian English; {en-gb} UK English; {en-ie} Irish English; {en-jm} Jamaican English; {en-nz} New Zealand English; {en-ph} Philippine English; {en-tt} Trinidad English; {en-us} US English; {en-za} South African English; {en-zw} Zimbabwe English. =item {enm} : Old English (1100-1500) (Historical) =item {ang} : Old English (ca.450-1100) eq Anglo-Saxon. (Historical) =item {i-enochian} : Enochian (Artificial) =item {myv} : Erzya =item {eo} : Esperanto (Artificial) =item {et} : Estonian =item {ee} : Ewe (Formerly "ewe".) =item {ewo} : Ewondo =item {fan} : Fang =item {fat} : Fanti =item {fo} : Faroese =item {fj} : Fijian =item {fi} : Finnish =item [{fiu} : Finno-Ugrian (Other)] eq Finno-Ugric. NOT Ugaritic! =item {fon} : Fon =item {fr} : French Notable forms: {fr-fr} France French; {fr-be} Belgian French; {fr-ca} Canadian French; {fr-ch} Swiss French; {fr-lu} Luxembourg French; {fr-mc} Monaco French. =item {frm} : Middle French (ca.1400-1600) (Historical) =item {fro} : Old French (842-ca.1400) (Historical) =item {fy} : Frisian =item {fur} : Friulian =item {ff} : Fulah (Formerly "ful".) =item {gaa} : Ga =item {gd} : Scots Gaelic NOT Scots! =item {gl} : Gallegan eq Galician =item {lg} : Ganda (Formerly "lug".) =item {gay} : Gayo =item {gba} : Gbaya =item {gez} : Geez eq Ge'ez =item {ka} : Georgian =item {de} : German Notable forms: {de-at} Austrian German; {de-be} Belgian German; {de-ch} Swiss German; {de-de} Germany German; {de-li} Liechtenstein German; {de-lu} Luxembourg German. =item {gmh} : Middle High German (ca.1050-1500) (Historical) =item {goh} : Old High German (ca.750-1050) (Historical) =item [{gem} : Germanic (Other)] =item {gil} : Gilbertese =item {gon} : Gondi =item {gor} : Gorontalo =item {got} : Gothic (Historical) =item {grb} : Grebo =item {grc} : Ancient Greek (Historical) (Until 15th century or so.) =item {el} : Modern Greek (Since 15th century or so.) =item {gn} : Guarani GuaranE<iacute> =item {gu} : Gujarati =item {gwi} : Gwich'in eq Gwichin =item {hai} : Haida =item {ht} : Haitian eq Haitian Creole =item {ha} : Hausa =item {haw} : Hawaiian Hawai'ian =item {he} : Hebrew (Formerly "iw".) =for etc {iw} Hebrew (old tag) =item {hz} : Herero =item {hil} : Hiligaynon =item {him} : Himachali =item {hi} : Hindi =item {ho} : Hiri Motu =item {hit} : Hittite (Historical) =item {hmn} : Hmong =item {hu} : Hungarian =item {hup} : Hupa =item {iba} : Iban =item {is} : Icelandic =item {io} : Ido (Artificial) =item {ig} : Igbo (Formerly "ibo".) =item {ijo} : Ijo =item {ilo} : Iloko =item [{inc} : Indic (Other)] =item [{ine} : Indo-European (Other)] =item {id} : Indonesian (Formerly "in".) =for etc {in} Indonesian (old tag) =item {inh} : Ingush =item {ia} : Interlingua (International Auxiliary Language Association) (Artificial) NOT Interlingue! =item {ie} : Interlingue (Artificial) NOT Interlingua! =item {iu} : Inuktitut A subform of "Eskimo". =item {ik} : Inupiaq A subform of "Eskimo". =item [{ira} : Iranian (Other)] =item {ga} : Irish =item {mga} : Middle Irish (900-1200) (Historical) =item {sga} : Old Irish (to 900) (Historical) =item [{iro} : Iroquoian languages] =item {it} : Italian Notable forms: {it-it} Italy Italian; {it-ch} Swiss Italian. =item {ja} : Japanese (NOT "jp"!) =item {jv} : Javanese (Formerly "jw" because of a typo.) =item {jrb} : Judeo-Arabic =item {jpr} : Judeo-Persian =item {kbd} : Kabardian =item {kab} : Kabyle =item {kac} : Kachin =item {kl} : Kalaallisut eq Greenlandic "Eskimo" =item {xal} : Kalmyk =item {kam} : Kamba =item {kn} : Kannada eq Kanarese. NOT Canadian! =item {kr} : Kanuri (Formerly "kau".) =item {krc} : Karachay-Balkar =item {kaa} : Kara-Kalpak =item {kar} : Karen =item {ks} : Kashmiri =item {csb} : Kashubian eq Kashub =item {kaw} : Kawi =item {kk} : Kazakh =item {kha} : Khasi =item {km} : Khmer eq Cambodian. eq Kampuchean. =item [{khi} : Khoisan (Other)] =item {kho} : Khotanese =item {ki} : Kikuyu eq Gikuyu. =item {kmb} : Kimbundu =item {rw} : Kinyarwanda =item {ky} : Kirghiz =item {i-klingon} : Klingon =item {kv} : Komi =item {kg} : Kongo (Formerly "kon".) =item {kok} : Konkani =item {ko} : Korean =item {kos} : Kosraean =item {kpe} : Kpelle =item {kro} : Kru =item {kj} : Kuanyama =item {kum} : Kumyk =item {ku} : Kurdish =item {kru} : Kurukh =item {kut} : Kutenai =item {lad} : Ladino eq Judeo-Spanish. NOT Ladin (a minority language in Italy). =item {lah} : Lahnda NOT Lamba! =item {lam} : Lamba NOT Lahnda! =item {lo} : Lao eq Laotian. =item {la} : Latin (Historical) NOT Ladin! NOT Ladino! =item {lv} : Latvian eq Lettish. =item {lb} : Letzeburgesch eq Luxemburgian, eq Luxemburger. (Formerly "i-lux".) =for etc {i-lux} Letzeburgesch (old tag) =item {lez} : Lezghian =item {li} : Limburgish eq Limburger, eq Limburgan. NOT Letzeburgesch! =item {ln} : Lingala =item {lt} : Lithuanian =item {nds} : Low German eq Low Saxon. eq Low German. eq Low Saxon. =item {art-lojban} : Lojban (Artificial) =item {loz} : Lozi =item {lu} : Luba-Katanga (Formerly "lub".) =item {lua} : Luba-Lulua =item {lui} : Luiseno eq LuiseE<ntilde>o. =item {lun} : Lunda =item {luo} : Luo (Kenya and Tanzania) =item {lus} : Lushai =item {mk} : Macedonian eq the modern Slavic language spoken in what was Yugoslavia. NOT the form of Greek spoken in Greek Macedonia! =item {mad} : Madurese =item {mag} : Magahi =item {mai} : Maithili =item {mak} : Makasar =item {mg} : Malagasy =item {ms} : Malay NOT Malayalam! =item {ml} : Malayalam NOT Malay! =item {mt} : Maltese =item {mnc} : Manchu =item {mdr} : Mandar NOT Mandarin! =item {man} : Mandingo =item {mni} : Manipuri eq Meithei. =item [{mno} : Manobo languages] =item {gv} : Manx =item {mi} : Maori NOT Mari! =item {mr} : Marathi =item {chm} : Mari NOT Maori! =item {mh} : Marshall eq Marshallese. =item {mwr} : Marwari =item {mas} : Masai =item [{myn} : Mayan languages] =item {men} : Mende =item {mic} : Micmac =item {min} : Minangkabau =item {i-mingo} : Mingo eq the Irquoian language West Virginia Seneca. NOT New York Seneca! =item [{mis} : Miscellaneous languages] Don't use this. =item {moh} : Mohawk =item {mdf} : Moksha =item {mo} : Moldavian eq Moldovan. =item [{mkh} : Mon-Khmer (Other)] =item {lol} : Mongo =item {mn} : Mongolian eq Mongol. =item {mos} : Mossi =item [{mul} : Multiple languages] Not for normal use. =item [{mun} : Munda languages] =item {nah} : Nahuatl =item {nap} : Neapolitan =item {na} : Nauru =item {nv} : Navajo eq Navaho. (Formerly "i-navajo".) =for etc {i-navajo} Navajo (old tag) =item {nd} : North Ndebele =item {nr} : South Ndebele =item {ng} : Ndonga =item {ne} : Nepali eq Nepalese. Notable forms: {ne-np} Nepal Nepali; {ne-in} India Nepali. =item {new} : Newari =item {nia} : Nias =item [{nic} : Niger-Kordofanian (Other)] =item [{ssa} : Nilo-Saharan (Other)] =item {niu} : Niuean =item {nog} : Nogai =item {non} : Old Norse (Historical) =item [{nai} : North American Indian] Do not use this. =item {no} : Norwegian Note the two following forms: =item {nb} : Norwegian Bokmal eq BokmE<aring>l, (A form of Norwegian.) (Formerly "no-bok".) =for etc {no-bok} Norwegian Bokmal (old tag) =item {nn} : Norwegian Nynorsk (A form of Norwegian.) (Formerly "no-nyn".) =for etc {no-nyn} Norwegian Nynorsk (old tag) =item [{nub} : Nubian languages] =item {nym} : Nyamwezi =item {nyn} : Nyankole =item {nyo} : Nyoro =item {nzi} : Nzima =item {oc} : Occitan (post 1500) eq ProvenE<ccedil>al, eq Provencal =item {oj} : Ojibwa eq Ojibwe. (Formerly "oji".) =item {or} : Oriya =item {om} : Oromo =item {osa} : Osage =item {os} : Ossetian; Ossetic =item [{oto} : Otomian languages] Group of languages collectively called "OtomE<iacute>". =item {pal} : Pahlavi eq Pahlevi =item {i-pwn} : Paiwan eq Pariwan =item {pau} : Palauan =item {pi} : Pali (Historical?) =item {pam} : Pampanga =item {pag} : Pangasinan =item {pa} : Panjabi eq Punjabi =item {pap} : Papiamento eq Papiamentu. =item [{paa} : Papuan (Other)] =item {fa} : Persian eq Farsi. eq Iranian. =item {peo} : Old Persian (ca.600-400 B.C.) =item [{phi} : Philippine (Other)] =item {phn} : Phoenician (Historical) =item {pon} : Pohnpeian NOT Pompeiian! =item {pl} : Polish =item {pt} : Portuguese eq Portugese. Notable forms: {pt-pt} Portugal Portuguese; {pt-br} Brazilian Portuguese. =item [{pra} : Prakrit languages] =item {pro} : Old Provencal (to 1500) eq Old ProvenE<ccedil>al. (Historical.) =item {ps} : Pushto eq Pashto. eq Pushtu. =item {qu} : Quechua eq Quecha. =item {rm} : Raeto-Romance eq Romansh. =item {raj} : Rajasthani =item {rap} : Rapanui =item {rar} : Rarotongan =item [{qaa - qtz} : Reserved for local use.] =item [{roa} : Romance (Other)] NOT Romanian! NOT Romany! NOT Romansh! =item {ro} : Romanian eq Rumanian. NOT Romany! =item {rom} : Romany eq Rom. NOT Romanian! =item {rn} : Rundi =item {ru} : Russian NOT White Russian! NOT Rusyn! =item [{sal} : Salishan languages] Large language group. =item {sam} : Samaritan Aramaic NOT Aramaic! =item {se} : Northern Sami eq Lappish. eq Lapp. eq (Northern) Saami. =item {sma} : Southern Sami =item {smn} : Inari Sami =item {smj} : Lule Sami =item {sms} : Skolt Sami =item [{smi} : Sami languages (Other)] =item {sm} : Samoan =item {sad} : Sandawe =item {sg} : Sango =item {sa} : Sanskrit (Historical) =item {sat} : Santali =item {sc} : Sardinian eq Sard. =item {sas} : Sasak =item {sco} : Scots NOT Scots Gaelic! =item {sel} : Selkup =item [{sem} : Semitic (Other)] =item {sr} : Serbian eq Serb. NOT Sorbian. Notable forms: {sr-Cyrl} : Serbian in Cyrillic script; {sr-Latn} : Serbian in Latin script. =item {srr} : Serer =item {shn} : Shan =item {sn} : Shona =item {sid} : Sidamo =item {sgn-...} : Sign Languages Always use with a subtag. Notable forms: {sgn-gb} British Sign Language (BSL); {sgn-ie} Irish Sign Language (ESL); {sgn-ni} Nicaraguan Sign Language (ISN); {sgn-us} American Sign Language (ASL). (And so on with other country codes as the subtag.) =item {bla} : Siksika eq Blackfoot. eq Pikanii. =item {sd} : Sindhi =item {si} : Sinhalese eq Sinhala. =item [{sit} : Sino-Tibetan (Other)] =item [{sio} : Siouan languages] =item {den} : Slave (Athapascan) ("Slavey" is a subform.) =item [{sla} : Slavic (Other)] =item {sk} : Slovak eq Slovakian. =item {sl} : Slovenian eq Slovene. =item {sog} : Sogdian =item {so} : Somali =item {son} : Songhai =item {snk} : Soninke =item {wen} : Sorbian languages eq Wendish. eq Sorb. eq Lusatian. eq Wend. NOT Venda! NOT Serbian! =item {nso} : Northern Sotho =item {st} : Southern Sotho eq Sutu. eq Sesotho. =item [{sai} : South American Indian (Other)] =item {es} : Spanish Notable forms: {es-ar} Argentine Spanish; {es-bo} Bolivian Spanish; {es-cl} Chilean Spanish; {es-co} Colombian Spanish; {es-do} Dominican Spanish; {es-ec} Ecuadorian Spanish; {es-es} Spain Spanish; {es-gt} Guatemalan Spanish; {es-hn} Honduran Spanish; {es-mx} Mexican Spanish; {es-pa} Panamanian Spanish; {es-pe} Peruvian Spanish; {es-pr} Puerto Rican Spanish; {es-py} Paraguay Spanish; {es-sv} Salvadoran Spanish; {es-us} US Spanish; {es-uy} Uruguayan Spanish; {es-ve} Venezuelan Spanish. =item {suk} : Sukuma =item {sux} : Sumerian (Historical) =item {su} : Sundanese =item {sus} : Susu =item {sw} : Swahili eq Kiswahili =item {ss} : Swati =item {sv} : Swedish Notable forms: {sv-se} Sweden Swedish; {sv-fi} Finland Swedish. =item {syr} : Syriac =item {tl} : Tagalog =item {ty} : Tahitian =item [{tai} : Tai (Other)] NOT Thai! =item {tg} : Tajik =item {tmh} : Tamashek =item {ta} : Tamil =item {i-tao} : Tao eq Yami. =item {tt} : Tatar =item {i-tay} : Tayal eq Atayal. eq Atayan. =item {te} : Telugu =item {ter} : Tereno =item {tet} : Tetum =item {th} : Thai NOT Tai! =item {bo} : Tibetan =item {tig} : Tigre =item {ti} : Tigrinya =item {tem} : Timne eq Themne. eq Timene. =item {tiv} : Tiv =item {tli} : Tlingit =item {tpi} : Tok Pisin =item {tkl} : Tokelau =item {tog} : Tonga (Nyasa) NOT Tsonga! =item {to} : Tonga (Tonga Islands) (Pronounced "Tong-a", not "Tong-ga") NOT Tsonga! =item {tsi} : Tsimshian eq Sm'algyax =item {ts} : Tsonga NOT Tonga! =item {i-tsu} : Tsou =item {tn} : Tswana Same as Setswana. =item {tum} : Tumbuka =item [{tup} : Tupi languages] =item {tr} : Turkish (Typically in Roman script) =item {ota} : Ottoman Turkish (1500-1928) (Typically in Arabic script) (Historical) =item {crh} : Crimean Turkish eq Crimean Tatar =item {tk} : Turkmen eq Turkmeni. =item {tvl} : Tuvalu =item {tyv} : Tuvinian eq Tuvan. eq Tuvin. =item {tw} : Twi =item {udm} : Udmurt =item {uga} : Ugaritic NOT Ugric! =item {ug} : Uighur =item {uk} : Ukrainian =item {umb} : Umbundu =item {und} : Undetermined Not a tag for normal use. =item {ur} : Urdu =item {uz} : Uzbek eq E<Ouml>zbek Notable forms: {uz-Cyrl} Uzbek in Cyrillic script; {uz-Latn} Uzbek in Latin script. =item {vai} : Vai =item {ve} : Venda NOT Wendish! NOT Wend! NOT Avestan! (Formerly "ven".) =item {vi} : Vietnamese eq Viet. =item {vo} : Volapuk eq VolapE<uuml>k. (Artificial) =item {vot} : Votic eq Votian. eq Vod. =item [{wak} : Wakashan languages] =item {wa} : Walloon =item {wal} : Walamo eq Wolaytta. =item {war} : Waray Presumably the Philippine language Waray-Waray (SamareE<ntilde>o), not the smaller Philippine language Waray Sorsogon, nor the extinct Australian language Waray. =item {was} : Washo eq Washoe =item {cy} : Welsh =item {wo} : Wolof =item {x-...} : Unregistered (Semi-Private Use) "x-" is a prefix for language tags that are not registered with ISO or IANA. Example, x-double-dutch =item {xh} : Xhosa =item {sah} : Yakut =item {yao} : Yao (The Yao in Malawi?) =item {yap} : Yapese eq Yap =item {ii} : Sichuan Yi =item {yi} : Yiddish Formerly "ji". Usually in Hebrew script. Notable forms: {yi-latn} Yiddish in Latin script =item {yo} : Yoruba =item [{ypk} : Yupik languages] Several "Eskimo" languages. =item {znd} : Zande =item [{zap} : Zapotec] (A group of languages.) =item {zen} : Zenaga NOT Zend. =item {za} : Zhuang =item {zu} : Zulu =item {zun} : Zuni eq ZuE<ntilde>i =back =for woohah END =head1 SEE ALSO L<I18N::LangTags|I18N::LangTags> and its "See Also" section. =head1 COPYRIGHT AND DISCLAIMER Copyright (c) 2001+ Sean M. Burke. All rights reserved. You can redistribute and/or modify this document under the same terms as Perl itself. This document is provided in the hope that it will be useful, but without any warranty; without even the implied warranty of accuracy, authoritativeness, completeness, merchantability, or fitness for a particular purpose. Email any corrections or questions to me. =head1 AUTHOR Sean M. Burke, sburkeE<64>cpan.org =cut # To generate a list of just the two and three-letter codes: #!/usr/local/bin/perl -w require 5; # Time-stamp: "2001-03-13 21:53:39 MST" # Sean M. Burke, sburke@cpan.org # This program is for generating the language_codes.txt file use strict; use LWP::Simple; use HTML::TreeBuilder 3.10; my $root = HTML::TreeBuilder->new(); my $url = 'http://lcweb.loc.gov/standards/iso639-2/bibcodes.html'; $root->parse(get($url) || die "Can't get $url"); $root->eof(); my @codes; foreach my $tr ($root->find_by_tag_name('tr')) { my @f = map $_->as_text(), $tr->content_list(); #print map("<$_> ", @f), "\n"; next unless @f == 5; pop @f; # nix the French name next if $f[-1] eq 'Language Name (English)'; # it's a header line my $xx = splice(@f, 2,1); # pull out the two-letter code $f[-1] =~ s/^\s+//; $f[-1] =~ s/\s+$//; if($xx =~ m/[a-zA-Z]/) { # there's a two-letter code for it push @codes, [ lc($f[-1]), "$xx\t$f[-1]\n" ]; } else { # print the three-letter codes. if($f[0] eq $f[1]) { push @codes, [ lc($f[-1]), "$f[1]\t$f[2]\n" ]; } else { # shouldn't happen push @codes, [ lc($f[-1]), "@f !!!!!!!!!!\n" ]; } } } print map $_->[1], sort {; $a->[0] cmp $b->[0] } @codes; print "[ based on $url\n at ", scalar(localtime), "]\n", "[Note: doesn't include IANA-registered codes.]\n"; exit; __END__ LangTags.pm 0000644 00000066651 14711200716 0006617 0 ustar 00 # Time-stamp: "2004-10-06 23:26:33 ADT" # Sean M. Burke <sburke@cpan.org> require 5.000; package I18N::LangTags; use strict; use vars qw(@ISA @EXPORT @EXPORT_OK %EXPORT_TAGS $VERSION %Panic); require Exporter; @ISA = qw(Exporter); @EXPORT = qw(); @EXPORT_OK = qw(is_language_tag same_language_tag extract_language_tags super_languages similarity_language_tag is_dialect_of locale2language_tag alternate_language_tags encode_language_tag panic_languages implicate_supers implicate_supers_strictly ); %EXPORT_TAGS = ('ALL' => \@EXPORT_OK); $VERSION = "0.38"; sub uniq { my %seen; return grep(!($seen{$_}++), @_); } # a util function =head1 NAME I18N::LangTags - functions for dealing with RFC3066-style language tags =head1 SYNOPSIS use I18N::LangTags(); ...or specify whichever of those functions you want to import, like so: use I18N::LangTags qw(implicate_supers similarity_language_tag); All the exportable functions are listed below -- you're free to import only some, or none at all. By default, none are imported. If you say: use I18N::LangTags qw(:ALL) ...then all are exported. (This saves you from having to use something less obvious like C<use I18N::LangTags qw(/./)>.) If you don't import any of these functions, assume a C<&I18N::LangTags::> in front of all the function names in the following examples. =head1 DESCRIPTION Language tags are a formalism, described in RFC 3066 (obsoleting 1766), for declaring what language form (language and possibly dialect) a given chunk of information is in. This library provides functions for common tasks involving language tags as they are needed in a variety of protocols and applications. Please see the "See Also" references for a thorough explanation of how to correctly use language tags. =over =cut ########################################################################### =item * the function is_language_tag($lang1) Returns true iff $lang1 is a formally valid language tag. is_language_tag("fr") is TRUE is_language_tag("x-jicarilla") is FALSE (Subtags can be 8 chars long at most -- 'jicarilla' is 9) is_language_tag("sgn-US") is TRUE (That's American Sign Language) is_language_tag("i-Klikitat") is TRUE (True without regard to the fact noone has actually registered Klikitat -- it's a formally valid tag) is_language_tag("fr-patois") is TRUE (Formally valid -- altho descriptively weak!) is_language_tag("Spanish") is FALSE is_language_tag("french-patois") is FALSE (No good -- first subtag has to match /^([xXiI]|[a-zA-Z]{2,3})$/ -- see RFC3066) is_language_tag("x-borg-prot2532") is TRUE (Yes, subtags can contain digits, as of RFC3066) =cut sub is_language_tag { ## Changes in the language tagging standards may have to be reflected here. my($tag) = lc($_[0]); return 0 if $tag eq "i" or $tag eq "x"; # Bad degenerate cases that the following # regexp would erroneously let pass return $tag =~ /^(?: # First subtag [xi] | [a-z]{2,3} ) (?: # Subtags thereafter - # separator [a-z0-9]{1,8} # subtag )* $/xs ? 1 : 0; } ########################################################################### =item * the function extract_language_tags($whatever) Returns a list of whatever looks like formally valid language tags in $whatever. Not very smart, so don't get too creative with what you want to feed it. extract_language_tags("fr, fr-ca, i-mingo") returns: ('fr', 'fr-ca', 'i-mingo') extract_language_tags("It's like this: I'm in fr -- French!") returns: ('It', 'in', 'fr') (So don't just feed it any old thing.) The output is untainted. If you don't know what tainting is, don't worry about it. =cut sub extract_language_tags { ## Changes in the language tagging standards may have to be reflected here. my($text) = $_[0] =~ m/(.+)/ # to make for an untainted result ? $1 : '' ; return grep(!m/^[ixIX]$/s, # 'i' and 'x' aren't good tags $text =~ m/ \b (?: # First subtag [iIxX] | [a-zA-Z]{2,3} ) (?: # Subtags thereafter - # separator [a-zA-Z0-9]{1,8} # subtag )* \b /xsg ); } ########################################################################### =item * the function same_language_tag($lang1, $lang2) Returns true iff $lang1 and $lang2 are acceptable variant tags representing the same language-form. same_language_tag('x-kadara', 'i-kadara') is TRUE (The x/i- alternation doesn't matter) same_language_tag('X-KADARA', 'i-kadara') is TRUE (...and neither does case) same_language_tag('en', 'en-US') is FALSE (all-English is not the SAME as US English) same_language_tag('x-kadara', 'x-kadar') is FALSE (these are totally unrelated tags) same_language_tag('no-bok', 'nb') is TRUE (no-bok is a legacy tag for nb (Norwegian Bokmal)) C<same_language_tag> works by just seeing whether C<encode_language_tag($lang1)> is the same as C<encode_language_tag($lang2)>. (Yes, I know this function is named a bit oddly. Call it historic reasons.) =cut sub same_language_tag { my $el1 = &encode_language_tag($_[0]); return 0 unless defined $el1; # this avoids the problem of # encode_language_tag($lang1) eq and encode_language_tag($lang2) # being true if $lang1 and $lang2 are both undef return $el1 eq &encode_language_tag($_[1]) ? 1 : 0; } ########################################################################### =item * the function similarity_language_tag($lang1, $lang2) Returns an integer representing the degree of similarity between tags $lang1 and $lang2 (the order of which does not matter), where similarity is the number of common elements on the left, without regard to case and to x/i- alternation. similarity_language_tag('fr', 'fr-ca') is 1 (one element in common) similarity_language_tag('fr-ca', 'fr-FR') is 1 (one element in common) similarity_language_tag('fr-CA-joual', 'fr-CA-PEI') is 2 similarity_language_tag('fr-CA-joual', 'fr-CA') is 2 (two elements in common) similarity_language_tag('x-kadara', 'i-kadara') is 1 (x/i- doesn't matter) similarity_language_tag('en', 'x-kadar') is 0 similarity_language_tag('x-kadara', 'x-kadar') is 0 (unrelated tags -- no similarity) similarity_language_tag('i-cree-syllabic', 'i-cherokee-syllabic') is 0 (no B<leftmost> elements in common!) =cut sub similarity_language_tag { my $lang1 = &encode_language_tag($_[0]); my $lang2 = &encode_language_tag($_[1]); # And encode_language_tag takes care of the whole # no-nyn==nn, i-hakka==zh-hakka, etc, things # NB: (i-sil-...)? (i-sgn-...)? return undef if !defined($lang1) and !defined($lang2); return 0 if !defined($lang1) or !defined($lang2); my @l1_subtags = split('-', $lang1); my @l2_subtags = split('-', $lang2); my $similarity = 0; while(@l1_subtags and @l2_subtags) { if(shift(@l1_subtags) eq shift(@l2_subtags)) { ++$similarity; } else { last; } } return $similarity; } ########################################################################### =item * the function is_dialect_of($lang1, $lang2) Returns true iff language tag $lang1 represents a subform of language tag $lang2. B<Get the order right! It doesn't work the other way around!> is_dialect_of('en-US', 'en') is TRUE (American English IS a dialect of all-English) is_dialect_of('fr-CA-joual', 'fr-CA') is TRUE is_dialect_of('fr-CA-joual', 'fr') is TRUE (Joual is a dialect of (a dialect of) French) is_dialect_of('en', 'en-US') is FALSE (all-English is a NOT dialect of American English) is_dialect_of('fr', 'en-CA') is FALSE is_dialect_of('en', 'en' ) is TRUE is_dialect_of('en-US', 'en-US') is TRUE (B<Note:> these are degenerate cases) is_dialect_of('i-mingo-tom', 'x-Mingo') is TRUE (the x/i thing doesn't matter, nor does case) is_dialect_of('nn', 'no') is TRUE (because 'nn' (New Norse) is aliased to 'no-nyn', as a special legacy case, and 'no-nyn' is a subform of 'no' (Norwegian)) =cut sub is_dialect_of { my $lang1 = &encode_language_tag($_[0]); my $lang2 = &encode_language_tag($_[1]); return undef if !defined($lang1) and !defined($lang2); return 0 if !defined($lang1) or !defined($lang2); return 1 if $lang1 eq $lang2; return 0 if length($lang1) < length($lang2); $lang1 .= '-'; $lang2 .= '-'; return (substr($lang1, 0, length($lang2)) eq $lang2) ? 1 : 0; } ########################################################################### =item * the function super_languages($lang1) Returns a list of language tags that are superordinate tags to $lang1 -- it gets this by removing subtags from the end of $lang1 until nothing (or just "i" or "x") is left. super_languages("fr-CA-joual") is ("fr-CA", "fr") super_languages("en-AU") is ("en") super_languages("en") is empty-list, () super_languages("i-cherokee") is empty-list, () ...not ("i"), which would be illegal as well as pointless. If $lang1 is not a valid language tag, returns empty-list in a list context, undef in a scalar context. A notable and rather unavoidable problem with this method: "x-mingo-tom" has an "x" because the whole tag isn't an IANA-registered tag -- but super_languages('x-mingo-tom') is ('x-mingo') -- which isn't really right, since 'i-mingo' is registered. But this module has no way of knowing that. (But note that same_language_tag('x-mingo', 'i-mingo') is TRUE.) More importantly, you assume I<at your peril> that superordinates of $lang1 are mutually intelligible with $lang1. Consider this carefully. =cut sub super_languages { my $lang1 = $_[0]; return() unless defined($lang1) && &is_language_tag($lang1); # a hack for those annoying new (2001) tags: $lang1 =~ s/^nb\b/no-bok/i; # yes, backwards $lang1 =~ s/^nn\b/no-nyn/i; # yes, backwards $lang1 =~ s/^[ix](-hakka\b)/zh$1/i; # goes the right way # i-hakka-bork-bjork-bjark => zh-hakka-bork-bjork-bjark my @l1_subtags = split('-', $lang1); ## Changes in the language tagging standards may have to be reflected here. # NB: (i-sil-...)? my @supers = (); foreach my $bit (@l1_subtags) { push @supers, scalar(@supers) ? ($supers[-1] . '-' . $bit) : $bit; } pop @supers if @supers; shift @supers if @supers && $supers[0] =~ m<^[iIxX]$>s; return reverse @supers; } ########################################################################### =item * the function locale2language_tag($locale_identifier) This takes a locale name (like "en", "en_US", or "en_US.ISO8859-1") and maps it to a language tag. If it's not mappable (as with, notably, "C" and "POSIX"), this returns empty-list in a list context, or undef in a scalar context. locale2language_tag("en") is "en" locale2language_tag("en_US") is "en-US" locale2language_tag("en_US.ISO8859-1") is "en-US" locale2language_tag("C") is undef or () locale2language_tag("POSIX") is undef or () locale2language_tag("POSIX") is undef or () I'm not totally sure that locale names map satisfactorily to language tags. Think REAL hard about how you use this. YOU HAVE BEEN WARNED. The output is untainted. If you don't know what tainting is, don't worry about it. =cut sub locale2language_tag { my $lang = $_[0] =~ m/(.+)/ # to make for an untainted result ? $1 : '' ; return $lang if &is_language_tag($lang); # like "en" $lang =~ tr<_><->; # "en_US" -> en-US $lang =~ s<(?:[\.\@][-_a-zA-Z0-9]+)+$><>s; # "en_US.ISO8859-1" -> en-US # it_IT.utf8@euro => it-IT return $lang if &is_language_tag($lang); return; } ########################################################################### =item * the function encode_language_tag($lang1) This function, if given a language tag, returns an encoding of it such that: * tags representing different languages never get the same encoding. * tags representing the same language always get the same encoding. * an encoding of a formally valid language tag always is a string value that is defined, has length, and is true if considered as a boolean. Note that the encoding itself is B<not> a formally valid language tag. Note also that you cannot, currently, go from an encoding back to a language tag that it's an encoding of. Note also that you B<must> consider the encoded value as atomic; i.e., you should not consider it as anything but an opaque, unanalysable string value. (The internals of the encoding method may change in future versions, as the language tagging standard changes over time.) C<encode_language_tag> returns undef if given anything other than a formally valid language tag. The reason C<encode_language_tag> exists is because different language tags may represent the same language; this is normally treatable with C<same_language_tag>, but consider this situation: You have a data file that expresses greetings in different languages. Its format is "[language tag]=[how to say 'Hello']", like: en-US=Hiho fr=Bonjour i-mingo=Hau' And suppose you write a program that reads that file and then runs as a daemon, answering client requests that specify a language tag and then expect the string that says how to greet in that language. So an interaction looks like: greeting-client asks: fr greeting-server answers: Bonjour So far so good. But suppose the way you're implementing this is: my %greetings; die unless open(IN, "<in.dat"); while(<IN>) { chomp; next unless /^([^=]+)=(.+)/s; my($lang, $expr) = ($1, $2); $greetings{$lang} = $expr; } close(IN); at which point %greetings has the contents: "en-US" => "Hiho" "fr" => "Bonjour" "i-mingo" => "Hau'" And suppose then that you answer client requests for language $wanted by just looking up $greetings{$wanted}. If the client asks for "fr", that will look up successfully in %greetings, to the value "Bonjour". And if the client asks for "i-mingo", that will look up successfully in %greetings, to the value "Hau'". But if the client asks for "i-Mingo" or "x-mingo", or "Fr", then the lookup in %greetings fails. That's the Wrong Thing. You could instead do lookups on $wanted with: use I18N::LangTags qw(same_language_tag); my $response = ''; foreach my $l2 (keys %greetings) { if(same_language_tag($wanted, $l2)) { $response = $greetings{$l2}; last; } } But that's rather inefficient. A better way to do it is to start your program with: use I18N::LangTags qw(encode_language_tag); my %greetings; die unless open(IN, "<in.dat"); while(<IN>) { chomp; next unless /^([^=]+)=(.+)/s; my($lang, $expr) = ($1, $2); $greetings{ encode_language_tag($lang) } = $expr; } close(IN); and then just answer client requests for language $wanted by just looking up $greetings{encode_language_tag($wanted)} And that does the Right Thing. =cut sub encode_language_tag { # Only similarity_language_tag() is allowed to analyse encodings! ## Changes in the language tagging standards may have to be reflected here. my($tag) = $_[0] || return undef; return undef unless &is_language_tag($tag); # For the moment, these legacy variances are few enough that # we can just handle them here with regexps. $tag =~ s/^iw\b/he/i; # Hebrew $tag =~ s/^in\b/id/i; # Indonesian $tag =~ s/^cre\b/cr/i; # Cree $tag =~ s/^jw\b/jv/i; # Javanese $tag =~ s/^[ix]-lux\b/lb/i; # Luxemburger $tag =~ s/^[ix]-navajo\b/nv/i; # Navajo $tag =~ s/^ji\b/yi/i; # Yiddish # SMB 2003 -- Hm. There's a bunch of new XXX->YY variances now, # but maybe they're all so obscure I can ignore them. "Obscure" # meaning either that the language is obscure, and/or that the # XXX form was extant so briefly that it's unlikely it was ever # used. I hope. # # These go FROM the simplex to complex form, to get # similarity-comparison right. And that's okay, since # similarity_language_tag is the only thing that # analyzes our output. $tag =~ s/^[ix]-hakka\b/zh-hakka/i; # Hakka $tag =~ s/^nb\b/no-bok/i; # BACKWARDS for Bokmal $tag =~ s/^nn\b/no-nyn/i; # BACKWARDS for Nynorsk $tag =~ s/^[xiXI]-//s; # Just lop off any leading "x/i-" return "~" . uc($tag); } #-------------------------------------------------------------------------- =item * the function alternate_language_tags($lang1) This function, if given a language tag, returns all language tags that are alternate forms of this language tag. (I.e., tags which refer to the same language.) This is meant to handle legacy tags caused by the minor changes in language tag standards over the years; and the x-/i- alternation is also dealt with. Note that this function does I<not> try to equate new (and never-used, and unusable) ISO639-2 three-letter tags to old (and still in use) ISO639-1 two-letter equivalents -- like "ara" -> "ar" -- because "ara" has I<never> been in use as an Internet language tag, and RFC 3066 stipulates that it never should be, since a shorter tag ("ar") exists. Examples: alternate_language_tags('no-bok') is ('nb') alternate_language_tags('nb') is ('no-bok') alternate_language_tags('he') is ('iw') alternate_language_tags('iw') is ('he') alternate_language_tags('i-hakka') is ('zh-hakka', 'x-hakka') alternate_language_tags('zh-hakka') is ('i-hakka', 'x-hakka') alternate_language_tags('en') is () alternate_language_tags('x-mingo-tom') is ('i-mingo-tom') alternate_language_tags('x-klikitat') is ('i-klikitat') alternate_language_tags('i-klikitat') is ('x-klikitat') This function returns empty-list if given anything other than a formally valid language tag. =cut my %alt = qw( i x x i I X X I ); sub alternate_language_tags { my $tag = $_[0]; return() unless &is_language_tag($tag); my @em; # push 'em real goood! # For the moment, these legacy variances are few enough that # we can just handle them here with regexps. if( $tag =~ m/^[ix]-hakka\b(.*)/i) {push @em, "zh-hakka$1"; } elsif($tag =~ m/^zh-hakka\b(.*)/i) { push @em, "x-hakka$1", "i-hakka$1"; } elsif($tag =~ m/^he\b(.*)/i) { push @em, "iw$1"; } elsif($tag =~ m/^iw\b(.*)/i) { push @em, "he$1"; } elsif($tag =~ m/^in\b(.*)/i) { push @em, "id$1"; } elsif($tag =~ m/^id\b(.*)/i) { push @em, "in$1"; } elsif($tag =~ m/^[ix]-lux\b(.*)/i) { push @em, "lb$1"; } elsif($tag =~ m/^lb\b(.*)/i) { push @em, "i-lux$1", "x-lux$1"; } elsif($tag =~ m/^[ix]-navajo\b(.*)/i) { push @em, "nv$1"; } elsif($tag =~ m/^nv\b(.*)/i) { push @em, "i-navajo$1", "x-navajo$1"; } elsif($tag =~ m/^yi\b(.*)/i) { push @em, "ji$1"; } elsif($tag =~ m/^ji\b(.*)/i) { push @em, "yi$1"; } elsif($tag =~ m/^nb\b(.*)/i) { push @em, "no-bok$1"; } elsif($tag =~ m/^no-bok\b(.*)/i) { push @em, "nb$1"; } elsif($tag =~ m/^nn\b(.*)/i) { push @em, "no-nyn$1"; } elsif($tag =~ m/^no-nyn\b(.*)/i) { push @em, "nn$1"; } push @em, $alt{$1} . $2 if $tag =~ /^([XIxi])(-.+)/; return @em; } ########################################################################### { # Init %Panic... my @panic = ( # MUST all be lowercase! # Only large ("national") languages make it in this list. # If you, as a user, are so bizarre that the /only/ language # you claim to accept is Galician, then no, we won't do you # the favor of providing Catalan as a panic-fallback for # you. Because if I start trying to add "little languages" in # here, I'll just go crazy. # Scandinavian lgs. All based on opinion and hearsay. 'sv' => [qw(nb no da nn)], 'da' => [qw(nb no sv nn)], # I guess [qw(no nn nb)], [qw(no nn nb sv da)], 'is' => [qw(da sv no nb nn)], 'fo' => [qw(da is no nb nn sv)], # I guess # I think this is about the extent of tolerable intelligibility # among large modern Romance languages. 'pt' => [qw(es ca it fr)], # Portuguese, Spanish, Catalan, Italian, French 'ca' => [qw(es pt it fr)], 'es' => [qw(ca it fr pt)], 'it' => [qw(es fr ca pt)], 'fr' => [qw(es it ca pt)], # Also assume that speakers of the main Indian languages prefer # to read/hear Hindi over English [qw( as bn gu kn ks kok ml mni mr ne or pa sa sd te ta ur )] => 'hi', # Assamese, Bengali, Gujarati, [Hindi,] Kannada (Kanarese), Kashmiri, # Konkani, Malayalam, Meithei (Manipuri), Marathi, Nepali, Oriya, # Punjabi, Sanskrit, Sindhi, Telugu, Tamil, and Urdu. 'hi' => [qw(bn pa as or)], # I welcome finer data for the other Indian languages. # E.g., what should Oriya's list be, besides just Hindi? # And the panic languages for English is, of course, nil! # My guesses at Slavic intelligibility: ([qw(ru be uk)]) x 2, # Russian, Belarusian, Ukranian 'sr' => 'hr', 'hr' => 'sr', # Serb + Croat 'cs' => 'sk', 'sk' => 'cs', # Czech + Slovak 'ms' => 'id', 'id' => 'ms', # Malay + Indonesian 'et' => 'fi', 'fi' => 'et', # Estonian + Finnish #?? 'lo' => 'th', 'th' => 'lo', # Lao + Thai ); my($k,$v); while(@panic) { ($k,$v) = splice(@panic,0,2); foreach my $k (ref($k) ? @$k : $k) { foreach my $v (ref($v) ? @$v : $v) { push @{$Panic{$k} ||= []}, $v unless $k eq $v; } } } } =item * the function @langs = panic_languages(@accept_languages) This function takes a list of 0 or more language tags that constitute a given user's Accept-Language list, and returns a list of tags for I<other> (non-super) languages that are probably acceptable to the user, to be used I<if all else fails>. For example, if a user accepts only 'ca' (Catalan) and 'es' (Spanish), and the documents/interfaces you have available are just in German, Italian, and Chinese, then the user will most likely want the Italian one (and not the Chinese or German one!), instead of getting nothing. So C<panic_languages('ca', 'es')> returns a list containing 'it' (Italian). English ('en') is I<always> in the return list, but whether it's at the very end or not depends on the input languages. This function works by consulting an internal table that stipulates what common languages are "close" to each other. A useful construct you might consider using is: @fallbacks = super_languages(@accept_languages); push @fallbacks, panic_languages( @accept_languages, @fallbacks, ); =cut sub panic_languages { # When in panic or in doubt, run in circles, scream, and shout! my(@out, %seen); foreach my $t (@_) { next unless $t; next if $seen{$t}++; # so we don't return it or hit it again # push @out, super_languages($t); # nah, keep that separate push @out, @{ $Panic{lc $t} || next }; } return grep !$seen{$_}++, @out, 'en'; } #--------------------------------------------------------------------------- #--------------------------------------------------------------------------- =item * the function implicate_supers( ...languages... ) This takes a list of strings (which are presumed to be language-tags; strings that aren't, are ignored); and after each one, this function inserts super-ordinate forms that don't already appear in the list. The original list, plus these insertions, is returned. In other words, it takes this: pt-br de-DE en-US fr pt-br-janeiro and returns this: pt-br pt de-DE de en-US en fr pt-br-janeiro This function is most useful in the idiom implicate_supers( I18N::LangTags::Detect::detect() ); (See L<I18N::LangTags::Detect>.) =item * the function implicate_supers_strictly( ...languages... ) This works like C<implicate_supers> except that the implicated forms are added to the end of the return list. In other words, implicate_supers_strictly takes a list of strings (which are presumed to be language-tags; strings that aren't, are ignored) and after the whole given list, it inserts the super-ordinate forms of all given tags, minus any tags that already appear in the input list. In other words, it takes this: pt-br de-DE en-US fr pt-br-janeiro and returns this: pt-br de-DE en-US fr pt-br-janeiro pt de en The reason this function has "_strictly" in its name is that when you're processing an Accept-Language list according to the RFCs, if you interpret the RFCs quite strictly, then you would use implicate_supers_strictly, but for normal use (i.e., common-sense use, as far as I'm concerned) you'd use implicate_supers. =cut sub implicate_supers { my @languages = grep is_language_tag($_), @_; my %seen_encoded; foreach my $lang (@languages) { $seen_encoded{ I18N::LangTags::encode_language_tag($lang) } = 1 } my(@output_languages); foreach my $lang (@languages) { push @output_languages, $lang; foreach my $s ( I18N::LangTags::super_languages($lang) ) { # Note that super_languages returns the longest first. last if $seen_encoded{ I18N::LangTags::encode_language_tag($s) }; push @output_languages, $s; } } return uniq( @output_languages ); } sub implicate_supers_strictly { my @tags = grep is_language_tag($_), @_; return uniq( @_, map super_languages($_), @_ ); } ########################################################################### 1; __END__ =back =head1 ABOUT LOWERCASING I've considered making all the above functions that output language tags return all those tags strictly in lowercase. Having all your language tags in lowercase does make some things easier. But you might as well just lowercase as you like, or call C<encode_language_tag($lang1)> where appropriate. =head1 ABOUT UNICODE PLAINTEXT LANGUAGE TAGS In some future version of I18N::LangTags, I plan to include support for RFC2482-style language tags -- which are basically just normal language tags with their ASCII characters shifted into Plane 14. =head1 SEE ALSO * L<I18N::LangTags::List|I18N::LangTags::List> * RFC 3066, C<http://www.ietf.org/rfc/rfc3066.txt>, "Tags for the Identification of Languages". (Obsoletes RFC 1766) * RFC 2277, C<http://www.ietf.org/rfc/rfc2277.txt>, "IETF Policy on Character Sets and Languages". * RFC 2231, C<http://www.ietf.org/rfc/rfc2231.txt>, "MIME Parameter Value and Encoded Word Extensions: Character Sets, Languages, and Continuations". * RFC 2482, C<http://www.ietf.org/rfc/rfc2482.txt>, "Language Tagging in Unicode Plain Text". * Locale::Codes, in C<http://www.perl.com/CPAN/modules/by-module/Locale/> * ISO 639-2, "Codes for the representation of names of languages", including two-letter and three-letter codes, C<http://www.loc.gov/standards/iso639-2/php/code_list.php> * The IANA list of registered languages (hopefully up-to-date), C<http://www.iana.org/assignments/language-tags> =head1 COPYRIGHT Copyright (c) 1998+ Sean M. Burke. All rights reserved. This library is free software; you can redistribute it and/or modify it under the same terms as Perl itself. The programs and documentation in this dist are distributed in the hope that they will be useful, but without any warranty; without even the implied warranty of merchantability or fitness for a particular purpose. =head1 AUTHOR Sean M. Burke C<sburke@cpan.org> =cut Collate.pm 0000644 00000012416 14711200716 0006470 0 ustar 00 package I18N::Collate; use strict; our $VERSION = '1.02'; =head1 NAME I18N::Collate - compare 8-bit scalar data according to the current locale =head1 SYNOPSIS use I18N::Collate; setlocale(LC_COLLATE, 'locale-of-your-choice'); $s1 = I18N::Collate->new("scalar_data_1"); $s2 = I18N::Collate->new("scalar_data_2"); =head1 DESCRIPTION *** WARNING: starting from the Perl version 5.003_06 the I18N::Collate interface for comparing 8-bit scalar data according to the current locale HAS BEEN DEPRECATED That is, please do not use it anymore for any new applications and please migrate the old applications away from it because its functionality was integrated into the Perl core language in the release 5.003_06. See the perllocale manual page for further information. *** This module provides you with objects that will collate according to your national character set, provided that the POSIX setlocale() function is supported on your system. You can compare $s1 and $s2 above with $s1 le $s2 to extract the data itself, you'll need a dereference: $$s1 This module uses POSIX::setlocale(). The basic collation conversion is done by strxfrm() which terminates at NUL characters being a decent C routine. collate_xfrm() handles embedded NUL characters gracefully. The available locales depend on your operating system; try whether C<locale -a> shows them or man pages for "locale" or "nlsinfo" or the direct approach C<ls /usr/lib/nls/loc> or C<ls /usr/lib/nls> or C<ls /usr/lib/locale>. Not all the locales that your vendor supports are necessarily installed: please consult your operating system's documentation and possibly your local system administration. The locale names are probably something like C<xx_XX.(ISO)?8859-N> or C<xx_XX.(ISO)?8859N>, for example C<fr_CH.ISO8859-1> is the Swiss (CH) variant of French (fr), ISO Latin (8859) 1 (-1) which is the Western European character set. =cut # I18N::Collate.pm # # Author: Jarkko Hietaniemi <F<jhi@iki.fi>> # Helsinki University of Technology, Finland # # Acks: Guy Decoux <F<decoux@moulon.inra.fr>> understood # overloading magic much deeper than I and told # how to cut the size of this code by more than half. # (my first version did overload all of lt gt eq le ge cmp) # # Purpose: compare 8-bit scalar data according to the current locale # # Requirements: Perl5 POSIX::setlocale() and POSIX::strxfrm() # # Exports: setlocale 1) # collate_xfrm 2) # # Overloads: cmp # 3) # # Usage: use I18N::Collate; # setlocale(LC_COLLATE, 'locale-of-your-choice'); # 4) # $s1 = I18N::Collate->("scalar_data_1"); # $s2 = I18N::Collate->("scalar_data_2"); # # now you can compare $s1 and $s2: $s1 le $s2 # to extract the data itself, you need to deref: $$s1 # # Notes: # 1) this uses POSIX::setlocale # 2) the basic collation conversion is done by strxfrm() which # terminates at NUL characters being a decent C routine. # collate_xfrm handles embedded NUL characters gracefully. # 3) due to cmp and overload magic, lt le eq ge gt work also # 4) the available locales depend on your operating system; # try whether "locale -a" shows them or man pages for # "locale" or "nlsinfo" work or the more direct # approach "ls /usr/lib/nls/loc" or "ls /usr/lib/nls". # Not all the locales that your vendor supports # are necessarily installed: please consult your # operating system's documentation. # The locale names are probably something like # 'xx_XX.(ISO)?8859-N' or 'xx_XX.(ISO)?8859N', # for example 'fr_CH.ISO8859-1' is the Swiss (CH) # variant of French (fr), ISO Latin (8859) 1 (-1) # which is the Western European character set. # # Updated: 19961005 # # --- use POSIX qw(strxfrm LC_COLLATE); use warnings::register; require Exporter; our @ISA = qw(Exporter); our @EXPORT = qw(collate_xfrm setlocale LC_COLLATE); our @EXPORT_OK = qw(); use overload qw( fallback 1 cmp collate_cmp ); our($LOCALE, $C); our $please_use_I18N_Collate_even_if_deprecated = 0; sub new { my $new = $_[1]; if (warnings::enabled() && $] >= 5.003_06) { unless ($please_use_I18N_Collate_even_if_deprecated) { warnings::warn <<___EOD___; *** WARNING: starting from the Perl version 5.003_06 the I18N::Collate interface for comparing 8-bit scalar data according to the current locale HAS BEEN DEPRECATED That is, please do not use it anymore for any new applications and please migrate the old applications away from it because its functionality was integrated into the Perl core language in the release 5.003_06. See the perllocale manual page for further information. *** ___EOD___ $please_use_I18N_Collate_even_if_deprecated++; } } bless \$new; } sub setlocale { my ($category, $locale) = @_[0,1]; POSIX::setlocale($category, $locale) if (defined $category); # the current $LOCALE $LOCALE = $locale || $ENV{'LC_COLLATE'} || $ENV{'LC_ALL'} || ''; } sub C { my $s = ${$_[0]}; $C->{$LOCALE}->{$s} = collate_xfrm($s) unless (defined $C->{$LOCALE}->{$s}); # cache when met $C->{$LOCALE}->{$s}; } sub collate_xfrm { my $s = $_[0]; my $x = ''; for (split(/(\000+)/, $s)) { $x .= (/^\000/) ? $_ : strxfrm("$_\000"); } $x; } sub collate_cmp { &C($_[0]) cmp &C($_[1]); } # init $LOCALE &I18N::Collate::setlocale(); 1; # keep require happy
| ver. 1.4 |
Github
|
.
| PHP 7.4.33 | Генерация страницы: 0 |
proxy
|
phpinfo
|
Настройка