Many wordlist updates

This commit is contained in:
Austen Adler 2023-04-30 22:16:54 -04:00
parent e8fc755083
commit 964ff0a043
7 changed files with 6398 additions and 6523 deletions

View File

@ -3199,7 +3199,6 @@ JADE
JAGUAR JAGUAR
JAIL JAIL
JAILHOUSE JAILHOUSE
JALAPEÑO
JAM JAM
JAR JAR
JASMINE JASMINE

File diff suppressed because it is too large Load Diff

View File

@ -1,4 +1,14 @@
WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER
BELOVED,BELOVE,SPACY
BLEED,BLEE,SPACY
EMBED,EMBE,SPACY
EXCEED,EXCEE,SPACY
FORTHCOMING,FORTHCOME,SPACY
NAKED,NAKE,SPACY
NON-PROFIT,NON,SPACY
ONGOING,ONGOE,SPACY
RENOWNED,RENOWNE,SPACY
SKILLED,SKILLE,SPACY
ADVERTISING,ADVERTISE,SPACY ADVERTISING,ADVERTISE,SPACY
BOSS,BOS,WORDNET BOSS,BOS,WORDNET
COMICS,COMIC_STRIP,SPACY COMICS,COMIC_STRIP,SPACY
@ -24,15 +34,8 @@ LESS,LE,WORDNET
THEMSELVES,THEMSELVE,SPACY THEMSELVES,THEMSELVE,SPACY
PERHAPS,PERHAP,SPACY PERHAPS,PERHAP,SPACY
OURSELVES,OURSELVE,SPACY OURSELVES,OURSELVE,SPACY
EXCEED,EXCEE,SPACY
BLEED,BLEE,SPACY
NAKED,NAKE,SPACY
SKILLED,SKILLE,SPACY
BELOVED,BELOVE,SPACY
LEST,L,SPACY LEST,L,SPACY
WICKED,WICKE,SPACY WICKED,WICKE,SPACY
EMBED,EMBE,SPACY
ONGOING,ONGOE,SPACY
ASHAMED,ASHAME,SPACY ASHAMED,ASHAME,SPACY
CREED,CREE,SPACY CREED,CREE,SPACY
VIS,VI,WORDNET VIS,VI,WORDNET
@ -43,14 +46,11 @@ UNCHANGED,UNCHANGE,SPACY
UNPUBLISHED,UNPUBLISHE,SPACY UNPUBLISHED,UNPUBLISHE,SPACY
BIS,BI,WORDNET BIS,BI,WORDNET
UNEMPLOYED,UNEMPLOYE,SPACY UNEMPLOYED,UNEMPLOYE,SPACY
FORTHCOMING,FORTHCOME,SPACY
METAPHYSICS,METAPHYSIC,SPACY METAPHYSICS,METAPHYSIC,SPACY
UNAFFECTED,UNAFFECTE,SPACY UNAFFECTED,UNAFFECTE,SPACY
RENOWNED,RENOWNE,SPACY
TALENTED,TALENTE,SPACY TALENTED,TALENTE,SPACY
UNFINISHED,UNFINISHE,SPACY UNFINISHED,UNFINISHE,SPACY
MS,M,WORDNET MS,M,WORDNET
AESTHETICS,AESTHETIC,WORDNET
INFRARED,INFRARE,SPACY INFRARED,INFRARE,SPACY
DISINTERESTED,DISINTERESTE,SPACY DISINTERESTED,DISINTERESTE,SPACY
OS,O,WORDNET OS,O,WORDNET
@ -244,7 +244,6 @@ GASWORKS,GASWORK,SPACY
BULLETED,BULLETE,SPACY BULLETED,BULLETE,SPACY
ARTEL,ROTL,WORDNET ARTEL,ROTL,WORDNET
HEARTSTRINGS,HEARTSTRING,SPACY HEARTSTRINGS,HEARTSTRING,SPACY
INCREMENTING,INCREMENTE,SPACY
UNCLEARED,UNCLEARE,SPACY UNCLEARED,UNCLEARE,SPACY
CONSOLS,CONSOL,SPACY CONSOLS,CONSOL,SPACY
MUDFLATS,MUDFLAT,SPACY MUDFLATS,MUDFLAT,SPACY
@ -438,7 +437,6 @@ COSMONAUTICS,COSMONAUTIC,SPACY
WHOLEGRAINS,WHOLEGRAIN,SPACY WHOLEGRAINS,WHOLEGRAIN,SPACY
NEEDMENTS,NEEDMENT,SPACY NEEDMENTS,NEEDMENT,SPACY
ACHATES,ACHATE,SPACY ACHATES,ACHATE,SPACY
PRECOMPILING,PRECOMPILE,SPACY
BALUSTERED,BALUSTERE,SPACY BALUSTERED,BALUSTERE,SPACY
JUGGINS,JUGGIN,SPACY JUGGINS,JUGGIN,SPACY
UNCONFIGURED,UNCONFIGURE,SPACY UNCONFIGURED,UNCONFIGURE,SPACY
@ -475,7 +473,6 @@ REEDING,REEDE,SPACY
INTERCROSSING,INTERCROSSE,SPACY INTERCROSSING,INTERCROSSE,SPACY
UNDEDUCTED,UNDEDUCTE,SPACY UNDEDUCTED,UNDEDUCTE,SPACY
AGOGICS,AGOGIC,SPACY AGOGICS,AGOGIC,SPACY
UNATTENDING,UNATTENDE,SPACY
OVERMASTED,OVERMASTE,SPACY OVERMASTED,OVERMASTE,SPACY
GILES,GILE,SPACY GILES,GILE,SPACY
NONCOPYRIGHTED,NONCOPYRIGHTE,SPACY NONCOPYRIGHTED,NONCOPYRIGHTE,SPACY
@ -523,7 +520,6 @@ EUPHRATES,EUPHRATE,SPACY
TROWING,TROWE,SPACY TROWING,TROWE,SPACY
LACEUPS,LACEUP,SPACY LACEUPS,LACEUP,SPACY
ALIPED,ALIPE,SPACY ALIPED,ALIPE,SPACY
TALIPED,TALIPE,SPACY
RAMSES,RAMSE,SPACY RAMSES,RAMSE,SPACY
CENTRONICS,CENTRONIC,SPACY CENTRONICS,CENTRONIC,SPACY
MITHRAS,MITHRA,WORDNET MITHRAS,MITHRA,WORDNET

1 WORD ATTEMPTED_LEMMATIZATION LEMMATIZER
2 BELOVED BELOVE SPACY
3 BLEED BLEE SPACY
4 EMBED EMBE SPACY
5 EXCEED EXCEE SPACY
6 FORTHCOMING FORTHCOME SPACY
7 NAKED NAKE SPACY
8 NON-PROFIT NON SPACY
9 ONGOING ONGOE SPACY
10 RENOWNED RENOWNE SPACY
11 SKILLED SKILLE SPACY
12 ADVERTISING ADVERTISE SPACY
13 BOSS BOS WORDNET
14 COMICS COMIC_STRIP SPACY
34 THEMSELVES THEMSELVE SPACY
35 PERHAPS PERHAP SPACY
36 OURSELVES OURSELVE SPACY
EXCEED EXCEE SPACY
BLEED BLEE SPACY
NAKED NAKE SPACY
SKILLED SKILLE SPACY
BELOVED BELOVE SPACY
37 LEST L SPACY
38 WICKED WICKE SPACY
EMBED EMBE SPACY
ONGOING ONGOE SPACY
39 ASHAMED ASHAME SPACY
40 CREED CREE SPACY
41 VIS VI WORDNET
46 UNPUBLISHED UNPUBLISHE SPACY
47 BIS BI WORDNET
48 UNEMPLOYED UNEMPLOYE SPACY
FORTHCOMING FORTHCOME SPACY
49 METAPHYSICS METAPHYSIC SPACY
50 UNAFFECTED UNAFFECTE SPACY
RENOWNED RENOWNE SPACY
51 TALENTED TALENTE SPACY
52 UNFINISHED UNFINISHE SPACY
53 MS M WORDNET
AESTHETICS AESTHETIC WORDNET
54 INFRARED INFRARE SPACY
55 DISINTERESTED DISINTERESTE SPACY
56 OS O WORDNET
244 BULLETED BULLETE SPACY
245 ARTEL ROTL WORDNET
246 HEARTSTRINGS HEARTSTRING SPACY
INCREMENTING INCREMENTE SPACY
247 UNCLEARED UNCLEARE SPACY
248 CONSOLS CONSOL SPACY
249 MUDFLATS MUDFLAT SPACY
437 WHOLEGRAINS WHOLEGRAIN SPACY
438 NEEDMENTS NEEDMENT SPACY
439 ACHATES ACHATE SPACY
PRECOMPILING PRECOMPILE SPACY
440 BALUSTERED BALUSTERE SPACY
441 JUGGINS JUGGIN SPACY
442 UNCONFIGURED UNCONFIGURE SPACY
473 INTERCROSSING INTERCROSSE SPACY
474 UNDEDUCTED UNDEDUCTE SPACY
475 AGOGICS AGOGIC SPACY
UNATTENDING UNATTENDE SPACY
476 OVERMASTED OVERMASTE SPACY
477 GILES GILE SPACY
478 NONCOPYRIGHTED NONCOPYRIGHTE SPACY
520 TROWING TROWE SPACY
521 LACEUPS LACEUP SPACY
522 ALIPED ALIPE SPACY
TALIPED TALIPE SPACY
523 RAMSES RAMSE SPACY
524 CENTRONICS CENTRONIC SPACY
525 MITHRAS MITHRA WORDNET

Binary file not shown.

View File

@ -22,12 +22,20 @@ nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])
print("Loading initial wordlist") print("Loading initial wordlist")
words = [] words = []
with open("./00-dsiquintans-nounlist.txt", "r") as infile:
for line in infile: for file in [
words.append(line.split(",")[0].strip()) "./00-oxford-5000.txt",
with gzip.open("./00-frequency-list.csv.gz", "r") as infile: "./00-desiquintans-nounlist.txt",
for line in infile: "./00-frequency-list.csv.gz",
words.append(line.decode("ascii").split(",")[0]) ]:
if file.endswith(".gz"):
with gzip.open(file, "r") as infile:
for line in infile:
words.append(line.decode("ascii").split(",")[0])
else:
with open(file, "r") as infile:
for line in infile:
words.append(line.split(",")[0].strip())
# Remove header # Remove header
words = words[1:] words = words[1:]
@ -36,7 +44,8 @@ print(words[0:5])
print("Lemmatizing words") print("Lemmatizing words")
seen_lemmatizations = set() # seen_lemmatizations = set()
seen_words = set()
with open("./01-errored-lemmatized-words.csv", "w") as erroutfile: with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n") erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")
@ -44,27 +53,33 @@ with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
with gzip.open("./01-lemmatized-words.csv.gz", "w") as outfile: with gzip.open("./01-lemmatized-words.csv.gz", "w") as outfile:
outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii")) outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))
# Make a progress bar so logs can be printed
iter = tqdm(words) iter = tqdm(words)
for word in iter: # Zip (progress bar-ed) word list with nlp.pipe so nlp can process chunks at a time
for (word, spacy_word) in zip(iter, nlp.pipe(words)):
lemmatized_words = [ lemmatized_words = [
# Wordnet
(wordnet.lemmatize(word.lower()).upper(), "WORDNET"), (wordnet.lemmatize(word.lower()).upper(), "WORDNET"),
# Spacy (spacy_word[0].lemma_.upper(), "SPACY"),
(nlp(word)[0].lemma_.upper().upper(), "SPACY"),
] ]
for lemmatized_word, lemmatizer in lemmatized_words: for lemmatized_word, lemmatizer in lemmatized_words:
if word == lemmatized_word: # if word == lemmatized_word:
continue # # This word is its own lemmatization
# continue
if (word, lemmatized_word) in seen_lemmatizations: # Skip words if we've already lemmatized them
continue # if (word, lemmatized_word) in seen_lemmatizations: continue
# seen_lemmatizations.add((word, lemmatized_word))
seen_lemmatizations.add((word, lemmatized_word)) # Skip words if they've already been added
if lemmatized_word in seen_words:
iter.write(f"{lemmatized_word} ({lemmatizer})\talready in seen_words")
continue
seen_words.add(lemmatized_word)
if lemmatized_word not in words: if lemmatized_word not in words:
iter.write(f"{lemmatized_word} ({lemmatizer}) not in all_words") iter.write(f"{lemmatized_word} ({lemmatizer})\tnot in all_words")
erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n") erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
continue continue

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff