【JavaScript】Unicode character class escapeを使う【正規表現】
こんにちは、フリーランスエンジニアの太田雅昭です。
プロローグ
漢字とかひらがなとか、正規表現で抜き出したいなぁ。。。Unicodeテーブルから範囲を抜き出そうかな。。。そや、ChatGPTに聞いてみよう!
GPT「は?すでにあるで」
なにー
Unicode character class escape
ECMAScriptの仕様で定義されており、MDNに解説があります。
// JavaScript requires the exact Unicode value name: `Uppercase_Letter` (or the
// short alias `Lu`). `UppercaseLetter` without the underscore is a SyntaxError.
"aA".match(/\p{Uppercase_Letter}/u); // ["A"] (a match array; element 0 is "A")
といったように使います。
ただとにかく分かりにくいです。まとめられている所も見つかりません。ですので、今回まとめてみます。
ライブラリにしてみた
今回のコードは、npmで公開しました。
https://www.npmjs.com/package/@masa-dev/unicode-escapes
必要なバリューを、定数として定義しています。使用するには、new RegExpで動的に生成する必要があります。その際、uフラグ(またはvフラグ)の指定が必須です(例: new RegExp(GC.UppercaseLetter, 'u'))。
以下、コードです。
Properties
General Category
General Categoryは以下です。
// Reference: https://unicode.org/reports/tr18/#General_Category_Property
/**
 * General Category
 *
 * Every member holds the source text of a `\p{…}` escape that uses the short
 * General_Category alias (e.g. `\p{Lu}`). The values are pattern fragments,
 * not RegExp objects: compile them with `new RegExp(GC.X, 'u')`.
 */
export enum GC {
  // L — letters
  Letter = '\\p{L}',
  UppercaseLetter = '\\p{Lu}',
  LowercaseLetter = '\\p{Ll}',
  TitlecaseLetter = '\\p{Lt}',
  ModifierLetter = '\\p{Lm}',
  OtherLetter = '\\p{Lo}',

  // M — marks
  Mark = '\\p{M}',
  NonSpacingMark = '\\p{Mn}',
  SpacingCombiningMark = '\\p{Mc}',
  EnclosingMark = '\\p{Me}',

  // N — numbers
  Number = '\\p{N}',
  DecimalDigitNumber = '\\p{Nd}',
  LetterNumber = '\\p{Nl}',
  OtherNumber = '\\p{No}',

  // S — symbols
  Symbol = '\\p{S}',
  MathSymbol = '\\p{Sm}',
  CurrencySymbol = '\\p{Sc}',
  ModifierSymbol = '\\p{Sk}',
  OtherSymbol = '\\p{So}',

  // P — punctuation
  Punctuation = '\\p{P}',
  ConnectorPunctuation = '\\p{Pc}',
  DashPunctuation = '\\p{Pd}',
  OpenPunctuation = '\\p{Ps}',
  ClosePunctuation = '\\p{Pe}',
  InitialPunctuation = '\\p{Pi}',
  FinalPunctuation = '\\p{Pf}',
  OtherPunctuation = '\\p{Po}',

  // Z — separators
  Separator = '\\p{Z}',
  SpaceSeparator = '\\p{Zs}',
  LineSeparator = '\\p{Zl}',
  ParagraphSeparator = '\\p{Zp}',

  // C — other (control, format, surrogate, private use, unassigned)
  Other = '\\p{C}',
  Control = '\\p{Cc}',
  Format = '\\p{Cf}',
  Surrogate = '\\p{Cs}',
  PrivateUse = '\\p{Co}',
  Unassigned = '\\p{Cn}',
}
Script Property
Scriptは以下です。
/**
 * Script Property
 *
 * Every member holds the source text of a `\p{Script=…}` escape. The values
 * are pattern fragments, not RegExp objects: compile them with
 * `new RegExp(Script.X, 'u')`.
 *
 * Member names are the Unicode script names in PascalCase with the underscores
 * removed. Seven members originally kept a stray underscore (for example
 * `KatakanaOr_Hiragana`); the consistently named member is now provided next
 * to each of them, and the old spelling is kept as a deprecated alias with the
 * same value so existing callers keep working.
 */
export enum Script {
  Adlam = '\\p{Script=Adlam}',
  CaucasianAlbanian = '\\p{Script=Caucasian_Albanian}',
  Ahom = '\\p{Script=Ahom}',
  Arabic = '\\p{Script=Arabic}',
  ImperialAramaic = '\\p{Script=Imperial_Aramaic}',
  Armenian = '\\p{Script=Armenian}',
  Avestan = '\\p{Script=Avestan}',
  Balinese = '\\p{Script=Balinese}',
  Bamum = '\\p{Script=Bamum}',
  BassaVah = '\\p{Script=Bassa_Vah}',
  Batak = '\\p{Script=Batak}',
  Bengali = '\\p{Script=Bengali}',
  Bhaiksuki = '\\p{Script=Bhaiksuki}',
  Bopomofo = '\\p{Script=Bopomofo}',
  Brahmi = '\\p{Script=Brahmi}',
  Braille = '\\p{Script=Braille}',
  Buginese = '\\p{Script=Buginese}',
  Buhid = '\\p{Script=Buhid}',
  Chakma = '\\p{Script=Chakma}',
  CanadianAboriginal = '\\p{Script=Canadian_Aboriginal}',
  Carian = '\\p{Script=Carian}',
  Cham = '\\p{Script=Cham}',
  Cherokee = '\\p{Script=Cherokee}',
  Chorasmian = '\\p{Script=Chorasmian}',
  Coptic = '\\p{Script=Coptic}',
  CyproMinoan = '\\p{Script=Cypro_Minoan}',
  Cypriot = '\\p{Script=Cypriot}',
  Cyrillic = '\\p{Script=Cyrillic}',
  Devanagari = '\\p{Script=Devanagari}',
  DivesAkuru = '\\p{Script=Dives_Akuru}',
  Dogra = '\\p{Script=Dogra}',
  Deseret = '\\p{Script=Deseret}',
  Duployan = '\\p{Script=Duployan}',
  EgyptianHieroglyphs = '\\p{Script=Egyptian_Hieroglyphs}',
  Elbasan = '\\p{Script=Elbasan}',
  Elymaic = '\\p{Script=Elymaic}',
  Ethiopic = '\\p{Script=Ethiopic}',
  Georgian = '\\p{Script=Georgian}',
  Glagolitic = '\\p{Script=Glagolitic}',
  GunjalaGondi = '\\p{Script=Gunjala_Gondi}',
  MasaramGondi = '\\p{Script=Masaram_Gondi}',
  Gothic = '\\p{Script=Gothic}',
  Grantha = '\\p{Script=Grantha}',
  Greek = '\\p{Script=Greek}',
  Gujarati = '\\p{Script=Gujarati}',
  Gurmukhi = '\\p{Script=Gurmukhi}',
  Hangul = '\\p{Script=Hangul}',
  Han = '\\p{Script=Han}',
  Hanunoo = '\\p{Script=Hanunoo}',
  Hatran = '\\p{Script=Hatran}',
  Hebrew = '\\p{Script=Hebrew}',
  Hiragana = '\\p{Script=Hiragana}',
  AnatolianHieroglyphs = '\\p{Script=Anatolian_Hieroglyphs}',
  PahawhHmong = '\\p{Script=Pahawh_Hmong}',
  NyiakengPuachueHmong = '\\p{Script=Nyiakeng_Puachue_Hmong}',
  /** @deprecated Use `NyiakengPuachueHmong`; kept for backward compatibility. */
  NyiakengPuachue_Hmong = '\\p{Script=Nyiakeng_Puachue_Hmong}',
  KatakanaOrHiragana = '\\p{Script=Katakana_Or_Hiragana}',
  /** @deprecated Use `KatakanaOrHiragana`; kept for backward compatibility. */
  KatakanaOr_Hiragana = '\\p{Script=Katakana_Or_Hiragana}',
  OldHungarian = '\\p{Script=Old_Hungarian}',
  OldItalic = '\\p{Script=Old_Italic}',
  Javanese = '\\p{Script=Javanese}',
  KayahLi = '\\p{Script=Kayah_Li}',
  Katakana = '\\p{Script=Katakana}',
  Kawi = '\\p{Script=Kawi}',
  Kharoshthi = '\\p{Script=Kharoshthi}',
  Khmer = '\\p{Script=Khmer}',
  Khojki = '\\p{Script=Khojki}',
  KhitanSmallScript = '\\p{Script=Khitan_Small_Script}',
  /** @deprecated Use `KhitanSmallScript`; kept for backward compatibility. */
  KhitanSmall_Script = '\\p{Script=Khitan_Small_Script}',
  Kannada = '\\p{Script=Kannada}',
  Kaithi = '\\p{Script=Kaithi}',
  TaiTham = '\\p{Script=Tai_Tham}',
  Lao = '\\p{Script=Lao}',
  Latin = '\\p{Script=Latin}',
  Lepcha = '\\p{Script=Lepcha}',
  Limbu = '\\p{Script=Limbu}',
  LinearA = '\\p{Script=Linear_A}',
  LinearB = '\\p{Script=Linear_B}',
  Lisu = '\\p{Script=Lisu}',
  Lycian = '\\p{Script=Lycian}',
  Lydian = '\\p{Script=Lydian}',
  Mahajani = '\\p{Script=Mahajani}',
  Makasar = '\\p{Script=Makasar}',
  Mandaic = '\\p{Script=Mandaic}',
  Manichaean = '\\p{Script=Manichaean}',
  Marchen = '\\p{Script=Marchen}',
  Medefaidrin = '\\p{Script=Medefaidrin}',
  MendeKikakui = '\\p{Script=Mende_Kikakui}',
  MeroiticCursive = '\\p{Script=Meroitic_Cursive}',
  MeroiticHieroglyphs = '\\p{Script=Meroitic_Hieroglyphs}',
  Malayalam = '\\p{Script=Malayalam}',
  Modi = '\\p{Script=Modi}',
  Mongolian = '\\p{Script=Mongolian}',
  Mro = '\\p{Script=Mro}',
  MeeteiMayek = '\\p{Script=Meetei_Mayek}',
  Multani = '\\p{Script=Multani}',
  Myanmar = '\\p{Script=Myanmar}',
  NagMundari = '\\p{Script=Nag_Mundari}',
  Nandinagari = '\\p{Script=Nandinagari}',
  OldNorthArabian = '\\p{Script=Old_North_Arabian}',
  /** @deprecated Use `OldNorthArabian`; kept for backward compatibility. */
  OldNorth_Arabian = '\\p{Script=Old_North_Arabian}',
  Nabataean = '\\p{Script=Nabataean}',
  Newa = '\\p{Script=Newa}',
  Nko = '\\p{Script=Nko}',
  Nushu = '\\p{Script=Nushu}',
  Ogham = '\\p{Script=Ogham}',
  OlChiki = '\\p{Script=Ol_Chiki}',
  OldTurkic = '\\p{Script=Old_Turkic}',
  Oriya = '\\p{Script=Oriya}',
  Osage = '\\p{Script=Osage}',
  Osmanya = '\\p{Script=Osmanya}',
  OldUyghur = '\\p{Script=Old_Uyghur}',
  Palmyrene = '\\p{Script=Palmyrene}',
  PauCinHau = '\\p{Script=Pau_Cin_Hau}',
  /** @deprecated Use `PauCinHau`; kept for backward compatibility. */
  PauCin_Hau = '\\p{Script=Pau_Cin_Hau}',
  OldPermic = '\\p{Script=Old_Permic}',
  PhagsPa = '\\p{Script=Phags_Pa}',
  InscriptionalPahlavi = '\\p{Script=Inscriptional_Pahlavi}',
  PsalterPahlavi = '\\p{Script=Psalter_Pahlavi}',
  Phoenician = '\\p{Script=Phoenician}',
  Miao = '\\p{Script=Miao}',
  InscriptionalParthian = '\\p{Script=Inscriptional_Parthian}',
  Rejang = '\\p{Script=Rejang}',
  HanifiRohingya = '\\p{Script=Hanifi_Rohingya}',
  Runic = '\\p{Script=Runic}',
  Samaritan = '\\p{Script=Samaritan}',
  OldSouthArabian = '\\p{Script=Old_South_Arabian}',
  /** @deprecated Use `OldSouthArabian`; kept for backward compatibility. */
  OldSouth_Arabian = '\\p{Script=Old_South_Arabian}',
  Saurashtra = '\\p{Script=Saurashtra}',
  SignWriting = '\\p{Script=SignWriting}',
  Shavian = '\\p{Script=Shavian}',
  Sharada = '\\p{Script=Sharada}',
  Siddham = '\\p{Script=Siddham}',
  Khudawadi = '\\p{Script=Khudawadi}',
  Sinhala = '\\p{Script=Sinhala}',
  Sogdian = '\\p{Script=Sogdian}',
  OldSogdian = '\\p{Script=Old_Sogdian}',
  SoraSompeng = '\\p{Script=Sora_Sompeng}',
  Soyombo = '\\p{Script=Soyombo}',
  Sundanese = '\\p{Script=Sundanese}',
  SylotiNagri = '\\p{Script=Syloti_Nagri}',
  Syriac = '\\p{Script=Syriac}',
  Tagbanwa = '\\p{Script=Tagbanwa}',
  Takri = '\\p{Script=Takri}',
  TaiLe = '\\p{Script=Tai_Le}',
  NewTaiLue = '\\p{Script=New_Tai_Lue}',
  /** @deprecated Use `NewTaiLue`; kept for backward compatibility. */
  NewTai_Lue = '\\p{Script=New_Tai_Lue}',
  Tamil = '\\p{Script=Tamil}',
  Tangut = '\\p{Script=Tangut}',
  TaiViet = '\\p{Script=Tai_Viet}',
  Telugu = '\\p{Script=Telugu}',
  Tifinagh = '\\p{Script=Tifinagh}',
  Tagalog = '\\p{Script=Tagalog}',
  Thaana = '\\p{Script=Thaana}',
  Thai = '\\p{Script=Thai}',
  Tibetan = '\\p{Script=Tibetan}',
  Tirhuta = '\\p{Script=Tirhuta}',
  Tangsa = '\\p{Script=Tangsa}',
  Toto = '\\p{Script=Toto}',
  Ugaritic = '\\p{Script=Ugaritic}',
  Vai = '\\p{Script=Vai}',
  Vithkuqi = '\\p{Script=Vithkuqi}',
  WarangCiti = '\\p{Script=Warang_Citi}',
  Wancho = '\\p{Script=Wancho}',
  OldPersian = '\\p{Script=Old_Persian}',
  Cuneiform = '\\p{Script=Cuneiform}',
  Yezidi = '\\p{Script=Yezidi}',
  Yi = '\\p{Script=Yi}',
  ZanabazarSquare = '\\p{Script=Zanabazar_Square}',
  Inherited = '\\p{Script=Inherited}',
  Common = '\\p{Script=Common}',
  Unknown = '\\p{Script=Unknown}',
}
Binary Unicode
Binary Unicode Propertyも使えます。なお、末尾の7つ(Basic_EmojiからRGI_Emojiまで)は文字列のプロパティのため、uフラグではなくvフラグでのみ使用できます。
// Reference: https://tc39.es/ecma262/multipage/text-processing.html#sec-runtime-semantics-unicodematchproperty-p
/**
 * Binary Unicode Property
 *
 * Every member holds the source text of a `\p{…}` escape naming a binary
 * property. The values are pattern fragments, not RegExp objects: compile
 * them with `new RegExp(BU.X, flags)`.
 *
 * The first group are properties of single code points and work with the `u`
 * or `v` flag. The last group are properties of strings: ECMAScript accepts
 * them only with the `v` flag, and they cannot be negated (`\P{…}` or inside
 * `[^…]`).
 */
export enum BU {
  // Properties of code points (flag `u` or `v`)
  ASCII = '\\p{ASCII}',
  ASCIIHexDigit = '\\p{ASCII_Hex_Digit}',
  Alphabetic = '\\p{Alphabetic}',
  Any = '\\p{Any}',
  Assigned = '\\p{Assigned}',
  BidiControl = '\\p{Bidi_Control}',
  BidiMirrored = '\\p{Bidi_Mirrored}',
  CaseIgnorable = '\\p{Case_Ignorable}',
  Cased = '\\p{Cased}',
  ChangesWhenCasefolded = '\\p{Changes_When_Casefolded}',
  ChangesWhenCasemapped = '\\p{Changes_When_Casemapped}',
  ChangesWhenLowercased = '\\p{Changes_When_Lowercased}',
  ChangesWhenNFKCCasefolded = '\\p{Changes_When_NFKC_Casefolded}',
  ChangesWhenTitlecased = '\\p{Changes_When_Titlecased}',
  ChangesWhenUppercased = '\\p{Changes_When_Uppercased}',
  Dash = '\\p{Dash}',
  DefaultIgnorableCodePoint = '\\p{Default_Ignorable_Code_Point}',
  Deprecated = '\\p{Deprecated}',
  Diacritic = '\\p{Diacritic}',
  Emoji = '\\p{Emoji}',
  EmojiComponent = '\\p{Emoji_Component}',
  EmojiModifier = '\\p{Emoji_Modifier}',
  EmojiModifierBase = '\\p{Emoji_Modifier_Base}',
  EmojiPresentation = '\\p{Emoji_Presentation}',
  ExtendedPictographic = '\\p{Extended_Pictographic}',
  Extender = '\\p{Extender}',
  GraphemeBase = '\\p{Grapheme_Base}',
  GraphemeExtend = '\\p{Grapheme_Extend}',
  HexDigit = '\\p{Hex_Digit}',
  IDSBinaryOperator = '\\p{IDS_Binary_Operator}',
  IDSTrinaryOperator = '\\p{IDS_Trinary_Operator}',
  IDContinue = '\\p{ID_Continue}',
  IDStart = '\\p{ID_Start}',
  Ideographic = '\\p{Ideographic}',
  JoinControl = '\\p{Join_Control}',
  LogicalOrderException = '\\p{Logical_Order_Exception}',
  Lowercase = '\\p{Lowercase}',
  Math = '\\p{Math}',
  NoncharacterCodePoint = '\\p{Noncharacter_Code_Point}',
  PatternSyntax = '\\p{Pattern_Syntax}',
  PatternWhiteSpace = '\\p{Pattern_White_Space}',
  QuotationMark = '\\p{Quotation_Mark}',
  Radical = '\\p{Radical}',
  RegionalIndicator = '\\p{Regional_Indicator}',
  SentenceTerminal = '\\p{Sentence_Terminal}',
  SoftDotted = '\\p{Soft_Dotted}',
  TerminalPunctuation = '\\p{Terminal_Punctuation}',
  UnifiedIdeograph = '\\p{Unified_Ideograph}',
  Uppercase = '\\p{Uppercase}',
  VariationSelector = '\\p{Variation_Selector}',
  WhiteSpace = '\\p{White_Space}',
  XIDContinue = '\\p{XID_Continue}',
  XIDStart = '\\p{XID_Start}',

  // Properties of strings (flag `v` only)
  BasicEmoji = '\\p{Basic_Emoji}',
  EmojiKeycapSequence = '\\p{Emoji_Keycap_Sequence}',
  RGIEmojiModifierSequence = '\\p{RGI_Emoji_Modifier_Sequence}',
  RGIEmojiFlagSequence = '\\p{RGI_Emoji_Flag_Sequence}',
  RGIEmojiTagSequence = '\\p{RGI_Emoji_Tag_Sequence}',
  RGIEmojiZWJSequence = '\\p{RGI_Emoji_ZWJ_Sequence}',
  RGIEmoji = '\\p{RGI_Emoji}',
}