Many wordlist updates
This commit is contained in:
parent
e8fc755083
commit
964ff0a043
@ -3199,7 +3199,6 @@ JADE
|
||||
JAGUAR
|
||||
JAIL
|
||||
JAILHOUSE
|
||||
JALAPEÑO
|
||||
JAM
|
||||
JAR
|
||||
JASMINE
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,14 @@
|
||||
WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER
|
||||
BELOVED,BELOVE,SPACY
|
||||
BLEED,BLEE,SPACY
|
||||
EMBED,EMBE,SPACY
|
||||
EXCEED,EXCEE,SPACY
|
||||
FORTHCOMING,FORTHCOME,SPACY
|
||||
NAKED,NAKE,SPACY
|
||||
NON-PROFIT,NON,SPACY
|
||||
ONGOING,ONGOE,SPACY
|
||||
RENOWNED,RENOWNE,SPACY
|
||||
SKILLED,SKILLE,SPACY
|
||||
ADVERTISING,ADVERTISE,SPACY
|
||||
BOSS,BOS,WORDNET
|
||||
COMICS,COMIC_STRIP,SPACY
|
||||
@ -24,15 +34,8 @@ LESS,LE,WORDNET
|
||||
THEMSELVES,THEMSELVE,SPACY
|
||||
PERHAPS,PERHAP,SPACY
|
||||
OURSELVES,OURSELVE,SPACY
|
||||
EXCEED,EXCEE,SPACY
|
||||
BLEED,BLEE,SPACY
|
||||
NAKED,NAKE,SPACY
|
||||
SKILLED,SKILLE,SPACY
|
||||
BELOVED,BELOVE,SPACY
|
||||
LEST,L,SPACY
|
||||
WICKED,WICKE,SPACY
|
||||
EMBED,EMBE,SPACY
|
||||
ONGOING,ONGOE,SPACY
|
||||
ASHAMED,ASHAME,SPACY
|
||||
CREED,CREE,SPACY
|
||||
VIS,VI,WORDNET
|
||||
@ -43,14 +46,11 @@ UNCHANGED,UNCHANGE,SPACY
|
||||
UNPUBLISHED,UNPUBLISHE,SPACY
|
||||
BIS,BI,WORDNET
|
||||
UNEMPLOYED,UNEMPLOYE,SPACY
|
||||
FORTHCOMING,FORTHCOME,SPACY
|
||||
METAPHYSICS,METAPHYSIC,SPACY
|
||||
UNAFFECTED,UNAFFECTE,SPACY
|
||||
RENOWNED,RENOWNE,SPACY
|
||||
TALENTED,TALENTE,SPACY
|
||||
UNFINISHED,UNFINISHE,SPACY
|
||||
MS,M,WORDNET
|
||||
AESTHETICS,AESTHETIC,WORDNET
|
||||
INFRARED,INFRARE,SPACY
|
||||
DISINTERESTED,DISINTERESTE,SPACY
|
||||
OS,O,WORDNET
|
||||
@ -244,7 +244,6 @@ GASWORKS,GASWORK,SPACY
|
||||
BULLETED,BULLETE,SPACY
|
||||
ARTEL,ROTL,WORDNET
|
||||
HEARTSTRINGS,HEARTSTRING,SPACY
|
||||
INCREMENTING,INCREMENTE,SPACY
|
||||
UNCLEARED,UNCLEARE,SPACY
|
||||
CONSOLS,CONSOL,SPACY
|
||||
MUDFLATS,MUDFLAT,SPACY
|
||||
@ -438,7 +437,6 @@ COSMONAUTICS,COSMONAUTIC,SPACY
|
||||
WHOLEGRAINS,WHOLEGRAIN,SPACY
|
||||
NEEDMENTS,NEEDMENT,SPACY
|
||||
ACHATES,ACHATE,SPACY
|
||||
PRECOMPILING,PRECOMPILE,SPACY
|
||||
BALUSTERED,BALUSTERE,SPACY
|
||||
JUGGINS,JUGGIN,SPACY
|
||||
UNCONFIGURED,UNCONFIGURE,SPACY
|
||||
@ -475,7 +473,6 @@ REEDING,REEDE,SPACY
|
||||
INTERCROSSING,INTERCROSSE,SPACY
|
||||
UNDEDUCTED,UNDEDUCTE,SPACY
|
||||
AGOGICS,AGOGIC,SPACY
|
||||
UNATTENDING,UNATTENDE,SPACY
|
||||
OVERMASTED,OVERMASTE,SPACY
|
||||
GILES,GILE,SPACY
|
||||
NONCOPYRIGHTED,NONCOPYRIGHTE,SPACY
|
||||
@ -523,7 +520,6 @@ EUPHRATES,EUPHRATE,SPACY
|
||||
TROWING,TROWE,SPACY
|
||||
LACEUPS,LACEUP,SPACY
|
||||
ALIPED,ALIPE,SPACY
|
||||
TALIPED,TALIPE,SPACY
|
||||
RAMSES,RAMSE,SPACY
|
||||
CENTRONICS,CENTRONIC,SPACY
|
||||
MITHRAS,MITHRA,WORDNET
|
||||
|
|
Binary file not shown.
@ -22,12 +22,20 @@ nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])
|
||||
print("Loading initial wordlist")
|
||||
|
||||
words = []
|
||||
with open("./00-dsiquintans-nounlist.txt", "r") as infile:
|
||||
for line in infile:
|
||||
words.append(line.split(",")[0].strip())
|
||||
with gzip.open("./00-frequency-list.csv.gz", "r") as infile:
|
||||
for line in infile:
|
||||
words.append(line.decode("ascii").split(",")[0])
|
||||
|
||||
for file in [
|
||||
"./00-oxford-5000.txt",
|
||||
"./00-desiquintans-nounlist.txt",
|
||||
"./00-frequency-list.csv.gz",
|
||||
]:
|
||||
if file.endswith(".gz"):
|
||||
with gzip.open(file, "r") as infile:
|
||||
for line in infile:
|
||||
words.append(line.decode("ascii").split(",")[0])
|
||||
else:
|
||||
with open(file, "r") as infile:
|
||||
for line in infile:
|
||||
words.append(line.split(",")[0].strip())
|
||||
|
||||
# Remove header
|
||||
words = words[1:]
|
||||
@ -36,7 +44,8 @@ print(words[0:5])
|
||||
|
||||
print("Lemmatizing words")
|
||||
|
||||
seen_lemmatizations = set()
|
||||
# seen_lemmatizations = set()
|
||||
seen_words = set()
|
||||
|
||||
with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
|
||||
erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")
|
||||
@ -44,27 +53,33 @@ with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
|
||||
with gzip.open("./01-lemmatized-words.csv.gz", "w") as outfile:
|
||||
outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))
|
||||
|
||||
# Make a progress bar so logs can be printed
|
||||
iter = tqdm(words)
|
||||
|
||||
for word in iter:
|
||||
# Zip (progress bar-ed) word list with nlp.pipe so nlp can process chunks at a time
|
||||
for (word, spacy_word) in zip(iter, nlp.pipe(words)):
|
||||
lemmatized_words = [
|
||||
# Wordnet
|
||||
(wordnet.lemmatize(word.lower()).upper(), "WORDNET"),
|
||||
# Spacy
|
||||
(nlp(word)[0].lemma_.upper().upper(), "SPACY"),
|
||||
(spacy_word[0].lemma_.upper(), "SPACY"),
|
||||
]
|
||||
|
||||
for lemmatized_word, lemmatizer in lemmatized_words:
|
||||
if word == lemmatized_word:
|
||||
continue
|
||||
# if word == lemmatized_word:
|
||||
# # This word is its own lemmatization
|
||||
# continue
|
||||
|
||||
if (word, lemmatized_word) in seen_lemmatizations:
|
||||
continue
|
||||
# Skip words if we've already lemmatized them
|
||||
# if (word, lemmatized_word) in seen_lemmatizations: continue
|
||||
# seen_lemmatizations.add((word, lemmatized_word))
|
||||
|
||||
seen_lemmatizations.add((word, lemmatized_word))
|
||||
# Skip words if they've already been added
|
||||
if lemmatized_word in seen_words:
|
||||
iter.write(f"{lemmatized_word} ({lemmatizer})\talready in seen_words")
|
||||
continue
|
||||
seen_words.add(lemmatized_word)
|
||||
|
||||
if lemmatized_word not in words:
|
||||
iter.write(f"{lemmatized_word} ({lemmatizer}) not in all_words")
|
||||
iter.write(f"{lemmatized_word} ({lemmatizer})\tnot in all_words")
|
||||
erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
|
||||
continue
|
||||
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user