Many wordlist updates

This commit is contained in:
Austen Adler 2023-04-30 22:16:54 -04:00
parent e8fc755083
commit 964ff0a043
7 changed files with 6398 additions and 6523 deletions

View File

@@ -3199,7 +3199,6 @@ JADE
JAGUAR
JAIL
JAILHOUSE
JALAPEÑO
JAM
JAR
JASMINE

File diff suppressed because it is too large Load Diff

View File

@@ -1,4 +1,14 @@
WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER
BELOVED,BELOVE,SPACY
BLEED,BLEE,SPACY
EMBED,EMBE,SPACY
EXCEED,EXCEE,SPACY
FORTHCOMING,FORTHCOME,SPACY
NAKED,NAKE,SPACY
NON-PROFIT,NON,SPACY
ONGOING,ONGOE,SPACY
RENOWNED,RENOWNE,SPACY
SKILLED,SKILLE,SPACY
ADVERTISING,ADVERTISE,SPACY
BOSS,BOS,WORDNET
COMICS,COMIC_STRIP,SPACY
@@ -24,15 +34,8 @@ LESS,LE,WORDNET
THEMSELVES,THEMSELVE,SPACY
PERHAPS,PERHAP,SPACY
OURSELVES,OURSELVE,SPACY
EXCEED,EXCEE,SPACY
BLEED,BLEE,SPACY
NAKED,NAKE,SPACY
SKILLED,SKILLE,SPACY
BELOVED,BELOVE,SPACY
LEST,L,SPACY
WICKED,WICKE,SPACY
EMBED,EMBE,SPACY
ONGOING,ONGOE,SPACY
ASHAMED,ASHAME,SPACY
CREED,CREE,SPACY
VIS,VI,WORDNET
@@ -43,14 +46,11 @@ UNCHANGED,UNCHANGE,SPACY
UNPUBLISHED,UNPUBLISHE,SPACY
BIS,BI,WORDNET
UNEMPLOYED,UNEMPLOYE,SPACY
FORTHCOMING,FORTHCOME,SPACY
METAPHYSICS,METAPHYSIC,SPACY
UNAFFECTED,UNAFFECTE,SPACY
RENOWNED,RENOWNE,SPACY
TALENTED,TALENTE,SPACY
UNFINISHED,UNFINISHE,SPACY
MS,M,WORDNET
AESTHETICS,AESTHETIC,WORDNET
INFRARED,INFRARE,SPACY
DISINTERESTED,DISINTERESTE,SPACY
OS,O,WORDNET
@@ -244,7 +244,6 @@ GASWORKS,GASWORK,SPACY
BULLETED,BULLETE,SPACY
ARTEL,ROTL,WORDNET
HEARTSTRINGS,HEARTSTRING,SPACY
INCREMENTING,INCREMENTE,SPACY
UNCLEARED,UNCLEARE,SPACY
CONSOLS,CONSOL,SPACY
MUDFLATS,MUDFLAT,SPACY
@@ -438,7 +437,6 @@ COSMONAUTICS,COSMONAUTIC,SPACY
WHOLEGRAINS,WHOLEGRAIN,SPACY
NEEDMENTS,NEEDMENT,SPACY
ACHATES,ACHATE,SPACY
PRECOMPILING,PRECOMPILE,SPACY
BALUSTERED,BALUSTERE,SPACY
JUGGINS,JUGGIN,SPACY
UNCONFIGURED,UNCONFIGURE,SPACY
@@ -475,7 +473,6 @@ REEDING,REEDE,SPACY
INTERCROSSING,INTERCROSSE,SPACY
UNDEDUCTED,UNDEDUCTE,SPACY
AGOGICS,AGOGIC,SPACY
UNATTENDING,UNATTENDE,SPACY
OVERMASTED,OVERMASTE,SPACY
GILES,GILE,SPACY
NONCOPYRIGHTED,NONCOPYRIGHTE,SPACY
@@ -523,7 +520,6 @@ EUPHRATES,EUPHRATE,SPACY
TROWING,TROWE,SPACY
LACEUPS,LACEUP,SPACY
ALIPED,ALIPE,SPACY
TALIPED,TALIPE,SPACY
RAMSES,RAMSE,SPACY
CENTRONICS,CENTRONIC,SPACY
MITHRAS,MITHRA,WORDNET

1 WORD ATTEMPTED_LEMMATIZATION LEMMATIZER
2 BELOVED BELOVE SPACY
3 BLEED BLEE SPACY
4 EMBED EMBE SPACY
5 EXCEED EXCEE SPACY
6 FORTHCOMING FORTHCOME SPACY
7 NAKED NAKE SPACY
8 NON-PROFIT NON SPACY
9 ONGOING ONGOE SPACY
10 RENOWNED RENOWNE SPACY
11 SKILLED SKILLE SPACY
12 ADVERTISING ADVERTISE SPACY
13 BOSS BOS WORDNET
14 COMICS COMIC_STRIP SPACY
34 THEMSELVES THEMSELVE SPACY
35 PERHAPS PERHAP SPACY
36 OURSELVES OURSELVE SPACY
EXCEED EXCEE SPACY
BLEED BLEE SPACY
NAKED NAKE SPACY
SKILLED SKILLE SPACY
BELOVED BELOVE SPACY
37 LEST L SPACY
38 WICKED WICKE SPACY
EMBED EMBE SPACY
ONGOING ONGOE SPACY
39 ASHAMED ASHAME SPACY
40 CREED CREE SPACY
41 VIS VI WORDNET
46 UNPUBLISHED UNPUBLISHE SPACY
47 BIS BI WORDNET
48 UNEMPLOYED UNEMPLOYE SPACY
FORTHCOMING FORTHCOME SPACY
49 METAPHYSICS METAPHYSIC SPACY
50 UNAFFECTED UNAFFECTE SPACY
RENOWNED RENOWNE SPACY
51 TALENTED TALENTE SPACY
52 UNFINISHED UNFINISHE SPACY
53 MS M WORDNET
AESTHETICS AESTHETIC WORDNET
54 INFRARED INFRARE SPACY
55 DISINTERESTED DISINTERESTE SPACY
56 OS O WORDNET
244 BULLETED BULLETE SPACY
245 ARTEL ROTL WORDNET
246 HEARTSTRINGS HEARTSTRING SPACY
INCREMENTING INCREMENTE SPACY
247 UNCLEARED UNCLEARE SPACY
248 CONSOLS CONSOL SPACY
249 MUDFLATS MUDFLAT SPACY
437 WHOLEGRAINS WHOLEGRAIN SPACY
438 NEEDMENTS NEEDMENT SPACY
439 ACHATES ACHATE SPACY
PRECOMPILING PRECOMPILE SPACY
440 BALUSTERED BALUSTERE SPACY
441 JUGGINS JUGGIN SPACY
442 UNCONFIGURED UNCONFIGURE SPACY
473 INTERCROSSING INTERCROSSE SPACY
474 UNDEDUCTED UNDEDUCTE SPACY
475 AGOGICS AGOGIC SPACY
UNATTENDING UNATTENDE SPACY
476 OVERMASTED OVERMASTE SPACY
477 GILES GILE SPACY
478 NONCOPYRIGHTED NONCOPYRIGHTE SPACY
520 TROWING TROWE SPACY
521 LACEUPS LACEUP SPACY
522 ALIPED ALIPE SPACY
TALIPED TALIPE SPACY
523 RAMSES RAMSE SPACY
524 CENTRONICS CENTRONIC SPACY
525 MITHRAS MITHRA WORDNET

Binary file not shown.

View File

@@ -22,12 +22,20 @@ nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])
print("Loading initial wordlist")
words = []
with open("./00-dsiquintans-nounlist.txt", "r") as infile:
for line in infile:
words.append(line.split(",")[0].strip())
with gzip.open("./00-frequency-list.csv.gz", "r") as infile:
for line in infile:
words.append(line.decode("ascii").split(",")[0])
for file in [
"./00-oxford-5000.txt",
"./00-desiquintans-nounlist.txt",
"./00-frequency-list.csv.gz",
]:
if file.endswith(".gz"):
with gzip.open(file, "r") as infile:
for line in infile:
words.append(line.decode("ascii").split(",")[0])
else:
with open(file, "r") as infile:
for line in infile:
words.append(line.split(",")[0].strip())
# Remove header
words = words[1:]
@@ -36,7 +44,8 @@ print(words[0:5])
print("Lemmatizing words")
seen_lemmatizations = set()
# seen_lemmatizations = set()
seen_words = set()
with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")
@@ -44,27 +53,33 @@ with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
with gzip.open("./01-lemmatized-words.csv.gz", "w") as outfile:
outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))
# Make a progress bar so logs can be printed
iter = tqdm(words)
for word in iter:
# Zip (progress bar-ed) word list with nlp.pipe so nlp can process chunks at a time
for (word, spacy_word) in zip(iter, nlp.pipe(words)):
lemmatized_words = [
# Wordnet
(wordnet.lemmatize(word.lower()).upper(), "WORDNET"),
# Spacy
(nlp(word)[0].lemma_.upper().upper(), "SPACY"),
(spacy_word[0].lemma_.upper(), "SPACY"),
]
for lemmatized_word, lemmatizer in lemmatized_words:
if word == lemmatized_word:
continue
# if word == lemmatized_word:
# # This word is its own lemmatization
# continue
if (word, lemmatized_word) in seen_lemmatizations:
continue
# Skip words if we've already lemmatized them
# if (word, lemmatized_word) in seen_lemmatizations: continue
# seen_lemmatizations.add((word, lemmatized_word))
seen_lemmatizations.add((word, lemmatized_word))
# Skip words if they've already been added
if lemmatized_word in seen_words:
iter.write(f"{lemmatized_word} ({lemmatizer})\talready in seen_words")
continue
seen_words.add(lemmatized_word)
if lemmatized_word not in words:
iter.write(f"{lemmatized_word} ({lemmatizer}) not in all_words")
iter.write(f"{lemmatized_word} ({lemmatizer})\tnot in all_words")
erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
continue

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff