#' Code stateIDs from text
#' @description
#'   This function allows for contemporary and historical countries or states
#'   to be identified in text.
#'   It uses a regular expression (regex) to search for
#'   a number of common names and alternative spellings for each entity.
#'   Matching is case-insensitive and is performed after the text has been
#'   transliterated to ASCII.
#'   The function returns either the three-letter abbreviation
#'   (an extended version of ISO-3166 alpha-3), or the name of the state.
#'   The function can also return multiple matches, where more than one
#'   country is mentioned in the text.
#'   Currently, the function can identify `r nrow(code_states())` entities.
#'   Updates, bug reports, and suggestions welcome.
#'   
#' @param text A vector of text to search for country names within.
#' @param code Logical whether the function should return the three-letter
#'   abbreviation (an extended version of ISO-3166 alpha-3), 
#'   or the name of the state.
#'   For the complete list of entities and their search terms, 
#'   run the function without an argument (i.e. `code_states()`).
#' @param max_count Integer how many countries to search for in each element
#'   of the vector.
#'   Where more than one country is matched, the countries are returned as a set,
#'   i.e. in the format "\{AUS,NZL\}".
#'   Matches are listed in the order in which the entities appear in the
#'   lookup table, not the order in which they appear in the text.
#'   By default `max_count = 1`, which will just return the first match.
#'   A negative value (e.g. `max_count = -1`) places no limit on the number
#'   of matches returned.
#' @returns A character vector of the same length as `text`,
#'  with either the three-letter abbreviation (an extended version of ISO-3166 alpha-3),
#'  or the name of the state, or `NA` where no match was found.
#'  If more than one entity is matched (only possible when `max_count` is not
#'  1), the matches are returned as a set,
#'  i.e. in the format "\{AUS,NZL\}".
#'  If the function is run without an argument, it returns
#'  a data frame with the complete list of entities and their search terms.
#' @importFrom stringi stri_trans_general stri_detect_regex
#' @importFrom knitr kable
#' @examples
#' code_states(c("I went to England",
#'   "I come from Venezuela",
#'   "Did you know there was a Lunda Empire?",
#'   "I like both Australia and New Zealand"))
#' code_states(c("I went to England",
#'   "I come from Venezuela",
#'   "Did you know there was a Lunda Empire?",
#'   "I like both Australia and New Zealand"), max_count = 2)
#' @importFrom purrr map_chr
#' @export
code_states <- function(text, code = TRUE, max_count = 1){
  
  # Called without input: expose the full lookup table instead of coding.
  if (missing(text)) return(countryRegex)
  
  # The search patterns are plain ASCII, so transliterate the text first.
  text <- stringi::stri_trans_general(text, "latin-ascii")
  
  # Loop-invariant lookups, extracted once rather than once per text element.
  patterns <- countryRegex[[3]]
  # Column 1 holds the three-letter codes, column 2 the entity labels.
  values <- countryRegex[[if (code) 1L else 2L]]
  
  purrr::map_chr(text, function(x) {
    # `max_count` makes stringi stop after that many hits and report NA for
    # the patterns it did not inspect; which() drops those NAs.
    hits <- which(stringi::stri_detect_regex(
      x, patterns,
      max_count = max_count,
      opts_regex = list(case_insensitive = TRUE)))
    matched <- values[hits]
    if (length(matched) == 0) {
      NA_character_
    } else if (length(matched) > 1) {
      # More than one hit is only possible when max_count is not 1
      # (including negative, i.e. unlimited); collapse the hits into one
      # set string so that a single value is returned per element.
      paste0("{", paste(matched, collapse = ","), "}")
    } else {
      matched
    }
  })
}

# Regex ####
# Lookup table used by code_states(), one row per entity:
#   stateID - three-letter code, returned when `code = TRUE`
#   Label   - entity name, returned when `code = FALSE`
#   Regex   - search pattern; code_states() applies it case-insensitively to
#             text that has already been transliterated to ASCII
# Row order matters: code_states() reports matches in row order and, with
# `max_count = 1`, only the first matching row, so reordering rows can
# change results.
countryRegex <- dplyr::tribble(
  ~stateID, ~Label, ~Regex,
  ## A ####
  "ABJ","Abuja","abuja|suleja",
  "ABK","Abkhazia","abkhaz",
  "ABW","Aruba","aruba",
  "ACH","Aceh","aceh",
  "ADA","Adamawa","adamawa|fombina|gongola",
  "AFG","Afghanistan","afghan|afghanistan",
  "AGO","Angola","angola",
  "AHB","Anhalt-Bernburg","anhalt-bernburg",
  "AHK","Anhalt-Kothen","anhalt-kothen|anhalt-cothen|anhalt-coethen|anhalt-kthen",
  "AHN","Asahan","asahan|assahan",
  "AIA","Anguilla","anguilla",
  "ALA","Aland Islands","aland.is|iles.aland",
  "ALB","Albania","albania|albanie",
  "ALT","Los Altos","los altos",
  "AND","Andorra","andorra|andorre",
  "ANH","Anhalt-Dessau","anhalt-dessau",
  "ANJ","Anjouan","anjouan|ndzuwani",
  "ANK","Ankole","ankole|nkore",
  "ARE","United Arab Emirates","emirates|u[[:punct:]]a[[:punct:]]e[[:punct:]]|uae|united.?arab.?em|emirats arabes unis",
  "ARG","Argentina","argentin|argentine",
  "ARO","Aro Confederacy","\\baro\\b|arochukwu",
  "ARM","Armenia","armenia|armenie",
  "ASA","Assam","assam",
  "ASI","Asir","asir",
  "ASM","American Samoa","american.samoa|samoa.americain",
  "AST","Ashanti","ashanti",
  "ATA","Antarctica","antarctica|antarctique",
  "ATF","French Southern Territories","french.southern.territories|terres.australes.francaises|french.southern.and.antarctic.territories",
  "ATG","Antigua and Barbuda","antigua|barbuda|antigua-et-barbuda",
  "AUH","Austria-Hungary","austria-hungary|austro-hungaria|aust-hung|aust empire|autria empire|austrian empire",
  "AUS","Australia","australia|australie|christmas is|cocos.*is|heard.is|mcdonald.is",
  "AUT","Austria","austria(?!-hungary| hungary)|austri.*emp|austrian|autriche",
  "AWS","Aussa","\\baussa\\b|\\bawsa\\b",
  "AZA","Azande","azande|azanda|zande|bakuba|bushongo",
  "AZE","Azerbaijan","azerbaijan|azerbaidjan|azerbeijan",
  ## B ####
  "BAD","Baden","baden",
  "BAU","Bau","\\bbau\\b",
  "BAV","Bavaria","bavaria",
  "BDG","Badung","badung|badong|bandanapura",
  "BDI","Burundi","burundi",
  "BEL","Belgium","(?!.*luxem).*belgium|belgian|flemish|belgique|walloon|brussels|bruxelles",
  "BEN","Benin","benin|dahome|dahomey|benin",
  "BES","Bonaire Sint Eustatius and Saba","bonaire|sint.eustatius|saba",
  "BFA","Burkina Faso","burkina|faso|upper.?volta|burkina ?faso|burkna faso",
  "BGD","Bangladesh","bangladesh|(?=.*east).*paki?stan|bangladesh",
  "BGI","Bagirmi","bag(i|ui)rmi|bakarmi",
  "BGM","Begemder","begemder",
  "BGR","Bulgaria","bulgaria|bulgarie",
  "BGU","Borgu","\\bborgu\\b|\\bborgawa\\b|\\bbariba\\b",
  "BHP","Bhopal","bhopal",
  "BHR","Bahrain","bahr.?in|bahre.?n|bahrein",
  "BHS","Bahamas","bahamas",
  "BHT","Bharatpur","bharatpur",
  "BIH","Bosnia and Herzegovina","herzegovina|bosnia|bosnie-herzegovine|herzego|bosnie",
  "BIK","Bikaner","bikaner",
  "BLI","Bangli","bangli|tanah.bali",
  "BLM","Saint Barthelemy","saint.barthelemy",
  "BLR","Belarus","belarus|byelo|bielorussie|belarus",
  "BLZ","Belize","belize|(?=.*british).*honduras|belize",
  "BMN","Bamum","bamum|bamoun",
  "BMU","Bermuda","bermuda",
  "BNJ","Benjermassin","benjermassin",
  "BOH","Bohol","bohol",
  "BOL","Plurinational State of Bolivia","(?!-).bolivia|bolivie|^bolivia$|bolivia..pluri",
  "BON","Bone","bone",
  "BOU","Burgundy","burgundy|bourgogne|burgandy",
  "BPR","Bahawalpur","bahawalpur",
  "BRA","Brazil","brazil|brasil|bresil|bresil",
  "BRB","Barbados","barbados|barbade",
  "BRE","Bremen","bremen",
  "BRK","Brakna","brakna",
  "BRN","Brunei Darussalam","brun.?i|brunei darussalam",
  "BTN","Bhutan","bhutan|bhoutan",
  "BUE","Buleleng","buleleng|baliling",
  "BUK","Bukhara","bukhara",
  "BUL","Bulungan","bulungan",
  "BUN","Bunyoro","bunyoro",
  "BUR","Urundi","\\burundi",
  "BVT","Bouvet Island","bouvet.is|ile.bouvet",
  "BWA","Botswana","botswana|bechuana|botswana",
  ## C ####
  "CAF","Central African Republic","central.african.republic|cen.* afr.* re.*|republique centrafricaine|centrafrique|centrafricaine|cen afr emp",
  "CAN","Canada","canada|canadian|canada|newfoundland",
  "CAY","Cayor","cayor|Kajoor",
  "CCK","Cocos Islands","cocos.is|iles.cocos|keeling",
  "CHA","Champasak","champasak|champassak",
  "CHE","Switzerland","switz|swiss|suisse",
  "CHI","Chien","chien|khouang|xiang|xieng",
  "CHL","Chile","chile|chili",
  "CHM","Chamba","chamba",
  "CHN","China","(?!.*\\bmac)(?!.*\\bhong)(?!.*\\btai)(?!.*\\brep).*china|(?=.*peo)(?=.*rep).*china|chine\\b|macao|china",
  "CHT","Chitral","chitral|kator",
  "CIV","Cote d'Ivoire","ivoire|ivory|cote d'ivo.+",
  "CMR","Cameroon","cameroon|cameroun|cameroun",
  "COD","Democratic Republic of the Congo","dem.*congo|congo(.+)?dem|d[[:punct:]]r[[:punct:]]c[[:punct:]]|belgian.?congo|congo.?free.?state|kinshasa|zaire|l.opoldville|drc| droc |rdc|droc|congo-kinshasa|congo dr|d.r. of the congo|congo.+kin|dr.congo",
  "COG","Congo","(?<!democratic )rep.*congo|brazz|congo.rep|congo..rep|congo..bra|^congo$",
  "COK","Cook Islands","cook.is|iles.cook",
  "COL","Colombia","(?<!great |gran )colombia|colombie",
  "COM","Comoros","comoro|comores",
  "CPV","Cape Verde","verde|cap-vert|cabo verde|cap vert",
  "CRI","Costa Rica","costa.?rica|costa rica",
  "CUB","Cuba","\\bcuba|cuba",
  "CUT","Cutch","cutch|Kutch|Kachchh",
  "CUW","Curacao","curacao",
  "CXR","Christmas Island","christmas.is|ile.christmas",
  "CYM","Cayman Islands","cayman.is|iles.cayman",
  "CYP","Cyprus","cyprus|chypre",
  "CZE","Czechia","(?=.*rep).*czech|czechia|bohemia|czechoslovakia|tchequie|czech|tchequ.+",
  ## D ####
  "DAG","Dagestan","dagestan|shamyl",
  "DDR","German Democratic Republic","german.dem.*rep|democratic.?rep.*germany|east.germany|ddr|german.dr",
  "DEU","Germany","(?<!east )germany|german(?!.*democratic) republic|prussia|allemagne|german.fr|german.fed|alsace-lorraine|brunswick|berlin",
  "DFR","Darfur","darfur",
  "DHR","Dhar","dhar\\b|puars",
  "DIR","Dir","\\bdir\\b",
  "DJA","Fouta Djallon","f(ou|u|uu)ta.djallon",
  "DJI","Djibouti","djibouti",
  "DLI","Deli Sultanate","\\bdeli\\b",
  "DMA","Dominica","dominica\\b|dominique",
  "DNK","Denmark","denmar|danish(?!.*waters)|danemark|greenland",
  "DOM","Dominican Republic","domin.*rep|republique dominicaine|santo domin",
  "DRV","Annam","annam",
  "DZA","Algeria","ottoman.?algeria|algerie|(?<!ottoman )algeria",
  "DZG","Free City of Danzig","danzig",
  ## E ####
  "EAZ","Zanzibar","zanzibar",
  "ECU","Ecuador","ecuador|equateur",
  "EGB","Egba","egba|abeokuta",
  "EGY","Egypt","egypt|egypte|united arab republic|(?<![:alpha:])uar(?![:alpha:])",
  "EHT","State of Haiti","state of haiti",
  "ERI","Eritrea","eritrea|erythr[ee]e",
  "ESH","Western Sahara","western.sahara|sahrawi|sahara occidental",
  "ESP","Spain","spain|castile|spanish(?! guinea)|espagne|canary|granada",
  "EST","Estonia","estonia|estonie",
  "ETH","Ethiopia","ethiopia|abyssinia|ethiopie|eth.pia",
  "ETS","Eastern Turkistan","eastern.turkistan|eastern.turkestan",
  "EUE","European Union","european union|\\beu\\b|\\be[[:punct:]]u[[:punct:]]\\b|european.union|union.euro|european.comm|european.economic.comm",
  ## F ####
  "FDU","Fuladu","fu(l|ll)adu|fulbe|firdu",
  "FIN","Finland","finland|finlande|\\baland\\b",
  "FJI","Fiji","fiji|fidji",
  "FLK","Falkland Islands","falkland.is|iles.falkland|malvinas",
  "FNJ","Funj","funj",
  "FRA","France","(?!.*\\bdep)(?!.*martinique).*france|french.?republic|gaul|france|french|guadeloupe|burgundy|mayotte|new.caledonia|martinique|r[ee]union|barthelemy|t.martin|st\\..martin|miquelon|futuna",
  "FRK","Frankfurt","frankfurt\\b|frankfort\\b",
  "FRO","Faroe Islands","faroe|faeroe|iles feroe",
  "FSM","Federated States of Micronesia","micronesia|micron[ee]sie",
  "FTO","Fouta Toro","f(ou|u|uu)ta.toro",
  ## G ####
  "GAB","Gabon","gabon",
  "GBR","United Kingdom","england|united.?kingdom|britain|british(?!.*hondur| east africa)|\\buk\\b|\\bu[[:punct:]]k[[:punct:]]\\b|royaume-uni|bermuda|anguilla|cayman|channel.is|akrotiri|dhekelia|falkland.is|virgin.is|gibraltar|guernsey|guernesey|jersey|isle of man|northern.ireland|pitcairn|helena|norfolk|montserrat|south.sandwich.is|caicos|grande.bretagne|the uk",
  "GCL","Gran Colombia","gran colombia|great colombia",
  "GEA","Gera","gera\\b",
  "GEL","Geledi Sultanate","geledi|gobroon",
  "GEO","Georgia","(?<!south )georgia|georgie|georgie",
  "GGY","Guernsey","guernsey|guernesey",
  "GHA","Ghana","ghana|gold.?coast|ghana",
  "GIA","Gianyar","gian(j|y)(a|e|ee)r",
  "GIB","Gibraltar","gibraltar",
  "GIN","Guinea","(?<!new |spanish |portuguese |equatorial |papua new )guinea(?!.*equ|-bissau| bissau)|(?<!nouvelle.)guinee(?!-bissa| equatoriale)",
  "GJJ","Gojjam","gojjam",
  "GLP","Guadeloupe","guadeloupe",
  "GMA","Gumma","\\bgumma\\b|\\bguma\\b",
  "GMB","Gambia","gambia|gambie",
  "GNB","Guinea-Bissau","bissau|(?=.*portu).*guinea|guin[ee]e.bissa[uo]",
  "GNJ","Gonja","gonja",
  "GNQ","Equatorial Guinea","guine.*eq|eq.*guine|(?=.*span).*guinea|guin[ee]e [ee]quatoriale",
  "GOB","Gobir-Tibiri","gobir.tibiri",
  "GOM","Gomma","\\bgomma\\b",
  "GOW","Gowa","\\bgowa\\b",
  "GRC","Greece","greece|hellenic|hellas|greek|gr[e]ce",
  "GRD","Grenada","grenada|grenade",
  "GRL","Greenland","greenland|groenland",
  "GTM","Guatemala","guatemala",
  "GUF","French Guiana","french.guiana|guyane.francaise",
  "GUM","Guam","guam",
  "GUY","Guyana","guyana|british.?guiana|guyana",
  "GWA","Gwalior","gwalior",
  ## H ####
  "HAN","Hanover","hanover",
  "HAR","Harrar","harrar|harar",
  "HAW","Hawaii","hawaii",
  "HBG","Hamburg","hamburg",
  "HEJ","Hejaz","hejaz",
  "HER","Herat","herat|heraut",
  "HHB","Hesse-Homburg","hesse.homburg",
  "HKG","Hong Kong","hong.kong|r.a.s. chinoise de hong kong",
  "HMD","Heard and McDonald Islands","heard.is|ile.heard|mcdonald.is|ile.mcdonald",
  "HND","Honduras","(?<!british )honduras|honduras",
  "HOH","Hohenzollern-Hechingen","hohenzollern.hechingen",
  "HOS","Hohenzollern-Sigmaringen","hohenzollern.sigmaringen",
  "HRB","Hiraab Emirate","hiraab|hirab",
  "HRV","Croatia","croatia|croatie",
  "HSE","Hesse Electoral","hesse.*lectoral|hesse.kassel|hesseelect",
  "HSG","Hesse Grand Ducal","hess.*gran.*ducal|hesse.darmstadt|hessegrand",
  "HTI","Haiti","(?<!state of )haiti|hayti",
  "HUN","Hungary","(?<!austria-|austria )hungary|hungarian|hongrie",
  "HUZ","Hunza","hunza",
  "HYO","Hobyo","hobyo|obbia",
  ## I ####
  "IBA","Ibadan","ibadan",
  "IBU","Ijebu","\\bijebu",
  "IDN","Indonesia","indonesia|indonesie",
  "IFE","Ile Ife","\\bile.ife\\b|\\bife\\b",
  "IGA","Igala","igala|egarra",
  "IJA","Ijaye","ijaye|kurumi",
  "IJE","Ijesa","ijesa|ekiti",
  "ILN","Ilorin","ilorin",
  "IMN","Isle of Man","isle.of.man",
  "IND","India","india(?!.*ocea)|\\binde\\b",
  "INO","Indore","indore",
  "IOT","British Indian Ocean Territory","indian.ocean.territory|diego.garcia",
  "IPH","Perak","perak",
  "IRL","Ireland","(?=.*(?<!northern )ireland)(?=.*(?<!britain and )ireland)|irish|irlande",
  "IRN","Islamic Republic of Iran","\\biran|persia|iran",
  "IRQ","Iraq","iraq|mesopotamia|irak",
  "ISL","Iceland","iceland|islande",
  "ISR","Israel","israel",
  "ITA","Italy","italy|italian|italo|italie",
  ## J ####
  "JAM","Jamaica","jama.?ca|jama[i]que",
  "JBI","Jambi","jambi\\b",
  "JBS","Jabal Shammar","jabal.shammar|hajl.shammar",
  "JEM","Jembrana","jembrana|djembrana|djambangrana|djambarana",
  "JEY","Jersey","jersey",
  "JLF","Jolof","jolof|djoloff|wolof",
  "JMK","Jimma-Kakka","jimma-kakka",
  "JOD","Jodhpur","jodhpur",
  "JOH","Johore","johor",
  "JOR","Jordan","jordan|jordanie",
  "JPN","Japan","japan|nippon|japon",
  "JPR","Jaipur","jaipur",
  "JSM","Jaisalmer","jaisalm(e|i)r",
  ## K ####
  "KAF","Kaffa","kaffa",
  "KAS","Kasanje","kasanje",
  "KAT","Kathiri Sultanate","kathiri sultanate",
  "KAZ","Kazakhstan","kazak|kazakhstan",
  "KBU","Kaabu","kaabu|\\bgabu\\b|ngabou|\\bngabu",
  "KED","Kedah","kedah",
  "KEL","Kelantan","kelantan",
  "KEN","Kenya","kenya|british.?east.?africa|east.?africa.?prot|kenya",
  "KET","Ketu","\\bketu\\b|\\bketou\\b",
  "KGZ","Kyrgyzstan","kyrgyz|kirghiz|kirghizistan",
  "KHI","Khairpur","kh(ai|ay|y|ye)rp(u|oo)r|khaipur",
  "KHM","Cambodia","cambodia|kampuchea|khmer|cambodge",
  "KHV","Khiva","khiva",
  "KIR","Kiribati","kiribati",
  "KLT","Kalat","kalat",
  "KLU","Klungkung","klungkung|kalong|kloengkoeng|kalungkung|klonkong",
  "KNA","St. Kitts and Nevis","kitts|nevis|saint-christophe.et.nieves|st.kitt",
  "KNB","Kanem-Bornu","kanem-bornu",
  "KNG","Kong","\\bkong.empire|dyula|ouattara|kong.wattara",
  "KOK","Kokand","kokand",
  "KON","Kongo Kingdom","kongo kingdom",
  "KOR","Republic of Korea","^korea$|(?<!peoples )republic.of.korea|coree.*sud|republique.de.coree|south.korea|korea.{1,2}rep|korea.{1,2}rok|korea.{1,2}south",
  "KOS","Kosovo","kosovo",
  "KOT","Kotah","kotah",
  "KPT","Kapurthala","kapurthala",
  "KRG","Karangasem (Bali and Lombok)","karangasem|bali.and.lombok",
  "KSG","Kishangarh","kishangarh|kishngarh",
  "KTA","Kaarta","kaarta",
  "KUB","Kuba","kuba",
  "KUN","Kunduz","(k|q)(u|oo)nd(u|oo)z|kataghan|badakhshan",
  "KUT","Kutai","kut(a|e)i|coti",
  "KWE","Karagwe","kara(g|q)we",
  "KWT","Kuwait","kuwait|koweit",
  "KZB","Kazembe","kazembe",
  ## L ####
  "LAO","Lao People's Democratic Republic","laos|lao.pdr|lao.people|lao, p.*d.*r|lao.*p\\.d\\.r",
  "LBN","Lebanon","lebanon|liban",
  "LBR","Liberia","liberia|liberia",
  "LBY","Libya","libya|tripolitania|libye",
  "LCA","St. Lucia","lucia|sainte-lucie",
  "LHJ","Lahej","lah(j|ej)\\b",
  "LIE","Liechtenstein","liechtenstein|liechtstein",
  "LIM","Limmu","enarya|limmu|ennarea",
  "LIP","Lippe","lippe",
  "LKA","Sri Lanka","sri.?lanka|ceylon|sri lanka",
  "LNG","Loango","loango|vili",
  "LOM","Lombok","(?<!bali.and.)lombok|mataram",
  "LPB","Luang Phrabang","luang.phrabang",
  "LSO","Lesotho","lesotho|basuto.*",
  "LTU","Lithuania","lithuania|lituanie|lithuanie",
  "LBA","Luba","luba",
  "LBK","Lubeck","l(u|ue)beck",
  "LUC","Lucca","lucca",
  "LUN","Lunda","\\blunda\\b",
  "LUX","Luxembourg","(?<!belgian )luxem|luxembourg",
  "LVA","Latvia","latvia|lettonie",
  "LWU","Luwu","luwu|ware",
  ## M ####
  "MAC","Macao","\\bmaca(o|u)\\b",
  "MAF","French St. Martin","s(aint|t|t.).martin",
  "MAR","Morocco","morocco|\\bmaroc|maroc",
  "MAS","Massina","massina|maasina|macina",
  "MCO","Monaco","monaco",
  "MDA","Republic of Moldova","moldov|moldavia|b(a|e)ssarabia|moldavie",
  "MDG","Madagascar","madagascar|malagasy|madagasacar",
  "MDK","Mandinka Empire","mandinka\\b|wa(s|ss)(a|u)l(u|ou)",
  "MDR","Mandara","mandara|wandala",
  "MDV","Maldives","maldive|maldives",
  "MEC","Mecklenburg Schwerin","mecklenbur.*schwerin|mecklenbur",
  "MEX","Mexico","mexic|mexique",
  "MGB","Mangbetu","mangbe(t|tt)u",
  "MGU","Maguindanao","maguindanao",
  "MGW","Mengwi","m(a|e)ng(w|o|u|uw)i",
  "MHL","Marshall Islands","marshall|iles marshall",
  "MIN","Minangkabau","minangkabau",
  "MKD","North Macedonia","macedonia|fyrom|former yugoslav republic of mac.*|mac[ee]doine",
  "MLI","Mali","\\bmali",
  "MLT","Malta","malta|malte",
  "MMR","Myanmar","myanmar|burma|myanmar (birmanie)",
  "MNB","Menabe","menabe|sakalava",
  "MNE","Montenegro","(?!.*serbia).*montenegro|mont[ee]n[ee]gro",
  "MNG","Mongolia","mongol|mongolie",
  "MNP","Northern Mariana Islands","northern.mariana.is|iles.mariana.du.nord",
  "MOD","Modena","modena",
  "MOH","Moheli","moheli|mwali",
  "MOS","Mossi","yatenga|mossi",
  "MOZ","Mozambique","mozambique",
  "MPR","Manipur","manipur",
  "MRD","Maradi","maradi|katsina",
  "MRT","Mauritania","mauritania|mauritanie",
  "MSR","Montserrat","montserrat",
  "MTA","Mataatua","mataatua",
  "MTJ","Majeerteen","maj(e|ee)rt(ee|ey|ai)n|migiurtinia",
  "MTQ","Martinique","martinique",
  "MTW","Mthetwa","m(t|th)e(t|th)wa",
  "MUS","Mauritius","mauritius|maurice",
  "MWI","Malawi","malawi|nyasa|malawi",
  "MYS","Malaysia","malaysia|malaisie|malay",
  "MYT","Mayotte","mayotte",
  ## N ####
  "NAG","Nagpur","nagpur",
  "NAM","Namibia","namibia|namibie",
  "NAS","Nassau","nassau",
  "NCL","New Caledonia","new.caledonia|nouvelle.caledonie",
  "NDB","Ndebele","(nde|mata)bele",
  "NDW","Ndwandwe","ndwandwe",
  "NER","Niger","niger(?!ia)",
  "NFK","Norfolk Island","norfolk.is|ile.norfolk",
  "NGA","Nigeria","nigeria",
  "NGS","Negeri Sembilan","negeri.sembilan|sri.menanti",
  "NGT","Ngati Toa","ngati.toa",
  "NIC","Nicaragua","nicaragua",
  "NIU","Niue","niue|nioue",
  "NLD","Netherlands","(?!.*ant)(?!.*carib).*netherlands|netherlands.antil|dutch.antil|aruba|curacao|pays-bas|bonaire|holland|(?!.*martin)(?!.*saba).*maarten|saint-martin (partie neerlandaise)",
  "NOR","Norway","norway|norweg|norvege|bouvet|svalbard",
  "NPH","Ngapuhi","ngapuhi",
  "NPL","Nepal","nepal",
  "NRU","Nauru","nauru",
  "NUP","Nupe","nupe",
  "NYA","Nyungu","nyungu\\b",
  "NZL","New Zealand","new.?zealand|nouvelle.zelande",
  ## O ####
  "OFS","Orange Free State","orange free state|ovs",
  "OGD","Oaugadougou","(oua|oau|wa)gad(ou|u)g(ou|u)",
  "OLD","Oldenburg","oldenburg",
  "OMN","Oman","\\boman|trucial|omaan",
  "OVB","Ovimbundu","ovimbundu",
  "OWO","Owo","\\bowo\\b",
  "OYO","Oyo","oyo",
  ## P ####
  "PAH","Pahang","pahang",
  "PAK","Pakistan","(?<!east |eastern )paki?stan|pakistan",
  "PAL","Palembang","palembang",
  "PAN","Panama","panama",
  "PCN","Pitcairn","pitcairn",
  "PER","Peru","peru|p[ee]rou",
  "PES","Peshwa","peshwa|poona",
  "PHL","Philippines","philippines",
  "PLW","Palau","palau|palaos",
  "PMA","Parma","parma",
  "PNG","Papua New Guinea","papua|new.?guinea|papouasie",
  "POL","Poland","poland|polish|pologne",
  "PPR","Polish Peoples Republic","polish peoples republic",
  "PRI","Puerto Rico","puerto.rico",
  "PRK","Democratic People's Republic of Korea","dprk|d.p.r.k|korea.+(d.p.r|dpr|north|dem.*peo.*rep)|(d.p.r|dpr|dem.*peo.*rep.*).+korea|north korea|cor[e]e.*nord|korea.dem|korea..dem|korea..peo|korea.pr|korea..pre-1950",
  "PRT","Portugal","portugal|portuguese|portugal|azores|acores",
  "PRY","Paraguay","paraguay(?! river)|paraguay",
  "PSE","Palestine","palestin|\\bgaza|west.?bank|territoires palestiniens",
  "PUN","Punjab","punjab|panj.?b|panj-.?b",
  "PYF","French Polynesia","french.polynesia|polynesie.francaise",
  ## Q ####
  "QAT","Qatar","qatar",
  "QUA","Qu'aiti Sultanate","qu'aiti|quaiti",
  ## R ####
  "RAB","Rabih","rabih",
  "REG","Reuss-Greiz","\\breuss.greiz\\b",
  "REU","Reunion","\\breunion\\b",
  "RIA","Riau","riau",
  "ROU","Romania","r(o|u|ou)mania|roumanie",
  "ROZ","Rozvi","roz(vi|we)|mutapa|changamire",
  "RSS","Reuss-Schleiz","\\breuss.schleiz\\b",
  "RUS","Russian Federation","\\brussia|russian fed.*|soviet.?union|union of soviet|u[[:punct:]]s[[:punct:]]s[[:punct:]]r[[:punct:]]|socialist.?republics|USSR|RSFSR|russie|un.*sov.*soc.*rep|moscow",
  "RVN","Republic of Vietnam","(?<!socialist|democratic) republic.of.viet.?nam|republic.of.viet.?nam|viet.?nam rep|south.viet.?nam|viet.?nam.south|svietnam|vietnam, s|vietnam, r",
  "RWA","Rwanda","rwanda|rwandese|ruanda",
  ## S ####
  "SAL","Saloum","saloum",
  "SAR","Sardinia","sardinia|Sardinia",
  "SAU","Saudi Arabia","sa.*arabia|first saudi state|emirate of dir.?iyah|arabie saoudite|najd",
  "SAW","Sawantvadi","sawantvadi|sawantwadi",
  "SAX","Saxony","saxony",
  "SCR","Schwarzburg-Rudolstadt","schwarzburg.rudolstadt",
  "SCS","Schwarzburg-Sondershausen","schwarzburg.sondershausen",
  "SDN","Sudan","(?<!south )sudan|soudan|funj sultanate of sennar",
  "SEG","Segou","segou|s.?gou",
  "SEL","Selangor","selangor",
  "SEN","Senegal","s[ee]n[ee]gal",
  "SGP","Singapore","singapore|singapour",
  "SGS","South Georgia and the South Sandwich Islands","south.georgia|south.sandwich.is|iles.sandwich.du.sud",
  "SHI","Shilluk","shilluk|(c|ch)ollo\\b",
  "SHN","St. Helena","saint-helena|st.helena|tristan.da.cunha|ascension",
  "SHO","Shoa","shoa|shewa|shua|showa|shuwa",
  "SIA","Siak","siak|siunia|syunik",
  "SIC","Two Sicilies","two.?sicilies|2.?sicilies",
  "SID","Sind","sind|sindh",
  "SIK","Sikkim","sikkim",
  "SIR","Sirohi","sirohi",
  "SIS","Sise","\\bsise\\b|moriuledugu|madina\\b",
  "SJM","Svalbard and Jan Mayen","svalbard|jan.mayen|iles.svalbard.et.jan.mayen",
  "SKO","Sikasso","sikasso",
  "SLB","Solomon Islands","solomon|salomon",
  "SLE","Sierra Leone","sierra.leo",
  "SLV","El Salvador","el.?salvador|salvador",
  "SMP","Sampthar","samp(t|th)ar",
  "SMR","San Marino","san.?marino|saint-marin",
  "SNB","Singhbum","sin(g|gh)b(hu|oo)m",
  "SOK","Sokoto","sokoto",
  "SOM","Somalia","somali|somalie",
  "SOT","South Ossetia","south ossetia",
  "SPM","Saint Pierre and Miquelon","s(aint|t|t.).pierre|miquelon",
  "SRB","Serbia","(?!.*monte).*serbia|serbie|yugoslav",
  "SSD","South Sudan","south.sudan|soudan du sud",
  "STP","Sao Tome and Principe","s[a]o.tom[ee]|sao tome-et-principe|saotome",
  "SUL","Sulu","\\bsulu\\b",
  "SUR","Suriname","surinam|dutch.?guiana|suriname",
  "SVK","Slovakia","(?<!czecho)slovak|slovaquie|slovaque",
  "SVN","Slovenia","sloven|slov[ee]nie",
  "SWA","Swat","^swat$|dera yusufzai",
  "SWE","Sweden","swed|su[e]de",
  "SWK","Sarawak","sarawak",
  "SWZ","Eswatini","swazi|eswatini",
  "SXA","Saxe-Altenburg","saxe-atenburg|saxe-altenburg",
  "SXG","Saxe-Coburg-Gotha","saxe-coburg-gotha",
  "SXH","Saxe-Meiningen-Hidburghausen","saxe.meiningen.hidburghausen",
  "SXM","Dutch Sint Maarten","sint.maarten",
  "SXW","Saxe-Weimar-Eisenach","saxe.wiemar.eisenach|saxe-weimar-eisenach",
  "SYC","Seychelles","seychell|seychelles",
  "SYR","Syrian Arab Republic","syria|syrie",
  ## T ####
  "TAB","Tabanan","\\btabanan\\b",
  "TAH","Tahiti","\\btahiti\\b|pomare",
  "TAM","Tampin","\\btampin\\b",
  "TBT","Tibet","tibet",
  "TCA","Turks and Caicos Islands","turks.and.caicos|iles.turques.et.caicos",
  "TCD","Chad","chad|tchad",
  "TEG","Tegali","t(eg|ak|aq)ali",
  "TEK","Teke","\\bteke\\b|ma(c|k)o(c|k)o|an(s|z)iku",
  "TER","Terengganu","terengganu|trengganu|tringganu",
  "TEX","Texas","texas",
  "TGO","Togo","togo",
  "THA","Thailand","thailand|siam|tha[i]lande",
  "TIG","Tigray","t(i|e)gr(ay|e)",
  "TJK","Tajikistan","tajik|tadjikistan",
  "TKL","Tokelau","tokelau",
  "TKM","Turkmenistan","turkmen|turkmenistan|turkmenistan",
  "TLS","Timor-Leste","(?=.*leste).*timor|(?=.*east).*timor|timor oriental|timor.?leste",
  "TNG","Tenkodogo","tenkodogo",
  "TNK","Tonk","\\btonk\\b",
  "TOK","Tokolor","tokolor",
  "TON","Tonga","tonga",
  "TOR","Toro","(?<!f(ou|u|uu)ta.)(but|t)oro\\b",
  "TRA","Transvaal","transvaal",
  "TRZ","Trarza","trarza",
  "TTO","Trinidad and Tobago","trinidad|tobag|trinite",
  "TUN","Tunisia","tunisia|tunisie",
  "TUR","Turkiye","turkey|turkiye|turquie|ottoman.emp",
  "TUS","Tuscany","tuscany",
  "TUV","Tuvalu","tu[vl]a[lv]u",
  "TWN","Chinese Taipei","ta[i]wan|taipei|formosa",
  "TZA","United Republic of Tanzania","tanzania|tanzanie|tanganyika",
  ## U ####
  "UGA","Uganda","uganda|buganda|ouganda",
  "UKR","Ukraine","ukrain|ukraine|ukrainian ssr|ukrainian soviet",
  "UMI","U.S. Minor Outlying Islands","u[[:punct:]]s[[:punct:]].minor.outlying.isl",
  "UNY","Unyanyembe","unyanyembe",
  "UPC","United Provinces of Central America","united provinces of central america|United Provinces of CA|United Province of CA|United Province CA|federal.provinces.of.central.america",
  "UPR","Udaipur","udaipur|mewar",
  "URA","Urambo","\\burambo\\b",
  "URY","Uruguay","uruguay",
  "USA","United States of America","united states of america|united.states(?!.*of brazil)|u[[:punct:]]s[[:punct:]]a[[:punct:]]|u[[:punct:]]s[[:punct:]](?!.*s[[:punct:]]r[[:punct:]])|\\busa\\b|\\bus\\b|etats-unis|northern.mariana|puerto.rico|guam|(?<![:alpha:])usa(?![:alpha:])",
  "UZB","Uzbekistan","uzbek|ouzb[ee]kistan",
  ## V ####
  "VAT","Holy See","holy.?see|vatican|papal.?st|etat de la cite du vatican|saint.siege",
  "VCT","St. Vincent and the Grenadines","vincent|saint-vincent-et-les-grenadines|st.vin",
  "VEN","Bolivarian Republic of Venezuela","v[ee]n[ee]zu[ee]la",
  "VGB","British Virgin Islands","british.virgin.is|iles.virgin.britanniques",
  "VIR","U.S. Virgin Islands","u[[:punct:]]s[[:punct:]].virgin.is|iles.virgin.des.etats-unis",
  "VLS","Valais Republic","valais.rep",
  "VNM","Viet Nam","(?<=socialist|democratic|north).*viet.?nam|viet.?nam socialist|viet.?nam democratic|viet.?nam.?dem|dem.?rep.?viet.?nam|viet.?nam n|viet.?nam, n|viet.?nam, d|viet.?nam d|(?<!republic of )viet.?nam",
  "VTN","Vientiane","vientiane|viang.chan",
  "VUT","Vanuatu","vanuatu|new.?hebrides|vanuatu",
  ## W ####
  "WAD","Wadai","wadai",
  "WAG","Whanganui","w(ha|a)nganui",
  "WAI","Waikato","waikato",
  "WDK","Waldeck","waldeck",
  "WLF","Wallis and Futuna","wallis.and.futuna|futuna.et.wallis",
  "WLO","Walo","\\bw(a|aa)lo\\b",
  "WRT","Wuerttemburg","w(ue)rttemburg|w.rtemberg|wuerttemberg",
  "WSM","Samoa","(?!.*amer).*samoa|samoa",
  ## Y ####
  "YEK","Yeke Kingdom","yeke kingdom",
  "YEM","Yemen","(?<= arab|north|sana).*yemen|yemen(?= arab)|yemen(?!.peo)",
  "YPR","Yemen Peoples Republic","(?=.*peo).*yemen|(?!.*rep)(?=.*dem).*yemen|(?=.*south).*yemen|(?=.*\\bp\\.?d\\.?r).*yemen|\\baden",
  ## Z ####
  "ZAF","South Africa","south.africa|s\\. africa|afrique du sud|so.africa|venda|bophutatswana|transkei|ciskei",
  "ZIN","Zinder","zinder|damagaram",
  "ZMB","Zambia","zambia|northern.?rhodesia|zambie",
  "ZUL","Zululand","zululand",
  "ZWE","Zimbabwe","zimbabwe|(?<!northern )rhodesia|zimbabwe"#,
  ## Other ####
  # "EC","European Community","european economic community|european community|e[[:punct:]]c[[:punct:]]|(?<![:alpha:])eec(?![:alpha:])|european.comm|euratom|union.euro|eu-15",
  # "AU","African Union","african.union",
  # "EFTA","European Free Trade Association","(?<![:alpha:])efta(?![:alpha:])|european free trade association"
)

# Useful resource:
# https://stringi.gagolewski.com/weave/regular_expressions.html#
