Many wordlist updates
This commit is contained in:
parent
e8fc755083
commit
964ff0a043
@ -3199,7 +3199,6 @@ JADE
|
|||||||
JAGUAR
|
JAGUAR
|
||||||
JAIL
|
JAIL
|
||||||
JAILHOUSE
|
JAILHOUSE
|
||||||
JALAPEÑO
|
|
||||||
JAM
|
JAM
|
||||||
JAR
|
JAR
|
||||||
JASMINE
|
JASMINE
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,4 +1,14 @@
|
|||||||
WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER
|
WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER
|
||||||
|
BELOVED,BELOVE,SPACY
|
||||||
|
BLEED,BLEE,SPACY
|
||||||
|
EMBED,EMBE,SPACY
|
||||||
|
EXCEED,EXCEE,SPACY
|
||||||
|
FORTHCOMING,FORTHCOME,SPACY
|
||||||
|
NAKED,NAKE,SPACY
|
||||||
|
NON-PROFIT,NON,SPACY
|
||||||
|
ONGOING,ONGOE,SPACY
|
||||||
|
RENOWNED,RENOWNE,SPACY
|
||||||
|
SKILLED,SKILLE,SPACY
|
||||||
ADVERTISING,ADVERTISE,SPACY
|
ADVERTISING,ADVERTISE,SPACY
|
||||||
BOSS,BOS,WORDNET
|
BOSS,BOS,WORDNET
|
||||||
COMICS,COMIC_STRIP,SPACY
|
COMICS,COMIC_STRIP,SPACY
|
||||||
@ -24,15 +34,8 @@ LESS,LE,WORDNET
|
|||||||
THEMSELVES,THEMSELVE,SPACY
|
THEMSELVES,THEMSELVE,SPACY
|
||||||
PERHAPS,PERHAP,SPACY
|
PERHAPS,PERHAP,SPACY
|
||||||
OURSELVES,OURSELVE,SPACY
|
OURSELVES,OURSELVE,SPACY
|
||||||
EXCEED,EXCEE,SPACY
|
|
||||||
BLEED,BLEE,SPACY
|
|
||||||
NAKED,NAKE,SPACY
|
|
||||||
SKILLED,SKILLE,SPACY
|
|
||||||
BELOVED,BELOVE,SPACY
|
|
||||||
LEST,L,SPACY
|
LEST,L,SPACY
|
||||||
WICKED,WICKE,SPACY
|
WICKED,WICKE,SPACY
|
||||||
EMBED,EMBE,SPACY
|
|
||||||
ONGOING,ONGOE,SPACY
|
|
||||||
ASHAMED,ASHAME,SPACY
|
ASHAMED,ASHAME,SPACY
|
||||||
CREED,CREE,SPACY
|
CREED,CREE,SPACY
|
||||||
VIS,VI,WORDNET
|
VIS,VI,WORDNET
|
||||||
@ -43,14 +46,11 @@ UNCHANGED,UNCHANGE,SPACY
|
|||||||
UNPUBLISHED,UNPUBLISHE,SPACY
|
UNPUBLISHED,UNPUBLISHE,SPACY
|
||||||
BIS,BI,WORDNET
|
BIS,BI,WORDNET
|
||||||
UNEMPLOYED,UNEMPLOYE,SPACY
|
UNEMPLOYED,UNEMPLOYE,SPACY
|
||||||
FORTHCOMING,FORTHCOME,SPACY
|
|
||||||
METAPHYSICS,METAPHYSIC,SPACY
|
METAPHYSICS,METAPHYSIC,SPACY
|
||||||
UNAFFECTED,UNAFFECTE,SPACY
|
UNAFFECTED,UNAFFECTE,SPACY
|
||||||
RENOWNED,RENOWNE,SPACY
|
|
||||||
TALENTED,TALENTE,SPACY
|
TALENTED,TALENTE,SPACY
|
||||||
UNFINISHED,UNFINISHE,SPACY
|
UNFINISHED,UNFINISHE,SPACY
|
||||||
MS,M,WORDNET
|
MS,M,WORDNET
|
||||||
AESTHETICS,AESTHETIC,WORDNET
|
|
||||||
INFRARED,INFRARE,SPACY
|
INFRARED,INFRARE,SPACY
|
||||||
DISINTERESTED,DISINTERESTE,SPACY
|
DISINTERESTED,DISINTERESTE,SPACY
|
||||||
OS,O,WORDNET
|
OS,O,WORDNET
|
||||||
@ -244,7 +244,6 @@ GASWORKS,GASWORK,SPACY
|
|||||||
BULLETED,BULLETE,SPACY
|
BULLETED,BULLETE,SPACY
|
||||||
ARTEL,ROTL,WORDNET
|
ARTEL,ROTL,WORDNET
|
||||||
HEARTSTRINGS,HEARTSTRING,SPACY
|
HEARTSTRINGS,HEARTSTRING,SPACY
|
||||||
INCREMENTING,INCREMENTE,SPACY
|
|
||||||
UNCLEARED,UNCLEARE,SPACY
|
UNCLEARED,UNCLEARE,SPACY
|
||||||
CONSOLS,CONSOL,SPACY
|
CONSOLS,CONSOL,SPACY
|
||||||
MUDFLATS,MUDFLAT,SPACY
|
MUDFLATS,MUDFLAT,SPACY
|
||||||
@ -438,7 +437,6 @@ COSMONAUTICS,COSMONAUTIC,SPACY
|
|||||||
WHOLEGRAINS,WHOLEGRAIN,SPACY
|
WHOLEGRAINS,WHOLEGRAIN,SPACY
|
||||||
NEEDMENTS,NEEDMENT,SPACY
|
NEEDMENTS,NEEDMENT,SPACY
|
||||||
ACHATES,ACHATE,SPACY
|
ACHATES,ACHATE,SPACY
|
||||||
PRECOMPILING,PRECOMPILE,SPACY
|
|
||||||
BALUSTERED,BALUSTERE,SPACY
|
BALUSTERED,BALUSTERE,SPACY
|
||||||
JUGGINS,JUGGIN,SPACY
|
JUGGINS,JUGGIN,SPACY
|
||||||
UNCONFIGURED,UNCONFIGURE,SPACY
|
UNCONFIGURED,UNCONFIGURE,SPACY
|
||||||
@ -475,7 +473,6 @@ REEDING,REEDE,SPACY
|
|||||||
INTERCROSSING,INTERCROSSE,SPACY
|
INTERCROSSING,INTERCROSSE,SPACY
|
||||||
UNDEDUCTED,UNDEDUCTE,SPACY
|
UNDEDUCTED,UNDEDUCTE,SPACY
|
||||||
AGOGICS,AGOGIC,SPACY
|
AGOGICS,AGOGIC,SPACY
|
||||||
UNATTENDING,UNATTENDE,SPACY
|
|
||||||
OVERMASTED,OVERMASTE,SPACY
|
OVERMASTED,OVERMASTE,SPACY
|
||||||
GILES,GILE,SPACY
|
GILES,GILE,SPACY
|
||||||
NONCOPYRIGHTED,NONCOPYRIGHTE,SPACY
|
NONCOPYRIGHTED,NONCOPYRIGHTE,SPACY
|
||||||
@ -523,7 +520,6 @@ EUPHRATES,EUPHRATE,SPACY
|
|||||||
TROWING,TROWE,SPACY
|
TROWING,TROWE,SPACY
|
||||||
LACEUPS,LACEUP,SPACY
|
LACEUPS,LACEUP,SPACY
|
||||||
ALIPED,ALIPE,SPACY
|
ALIPED,ALIPE,SPACY
|
||||||
TALIPED,TALIPE,SPACY
|
|
||||||
RAMSES,RAMSE,SPACY
|
RAMSES,RAMSE,SPACY
|
||||||
CENTRONICS,CENTRONIC,SPACY
|
CENTRONICS,CENTRONIC,SPACY
|
||||||
MITHRAS,MITHRA,WORDNET
|
MITHRAS,MITHRA,WORDNET
|
||||||
|
|
Binary file not shown.
@ -22,12 +22,20 @@ nlp = spacy.load("en_core_web_trf", disable=["parser", "ner"])
|
|||||||
print("Loading initial wordlist")
|
print("Loading initial wordlist")
|
||||||
|
|
||||||
words = []
|
words = []
|
||||||
with open("./00-dsiquintans-nounlist.txt", "r") as infile:
|
|
||||||
for line in infile:
|
for file in [
|
||||||
words.append(line.split(",")[0].strip())
|
"./00-oxford-5000.txt",
|
||||||
with gzip.open("./00-frequency-list.csv.gz", "r") as infile:
|
"./00-desiquintans-nounlist.txt",
|
||||||
|
"./00-frequency-list.csv.gz",
|
||||||
|
]:
|
||||||
|
if file.endswith(".gz"):
|
||||||
|
with gzip.open(file, "r") as infile:
|
||||||
for line in infile:
|
for line in infile:
|
||||||
words.append(line.decode("ascii").split(",")[0])
|
words.append(line.decode("ascii").split(",")[0])
|
||||||
|
else:
|
||||||
|
with open(file, "r") as infile:
|
||||||
|
for line in infile:
|
||||||
|
words.append(line.split(",")[0].strip())
|
||||||
|
|
||||||
# Remove header
|
# Remove header
|
||||||
words = words[1:]
|
words = words[1:]
|
||||||
@ -36,7 +44,8 @@ print(words[0:5])
|
|||||||
|
|
||||||
print("Lemmatizing words")
|
print("Lemmatizing words")
|
||||||
|
|
||||||
seen_lemmatizations = set()
|
# seen_lemmatizations = set()
|
||||||
|
seen_words = set()
|
||||||
|
|
||||||
with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
|
with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
|
||||||
erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")
|
erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")
|
||||||
@ -44,27 +53,33 @@ with open("./01-errored-lemmatized-words.csv", "w") as erroutfile:
|
|||||||
with gzip.open("./01-lemmatized-words.csv.gz", "w") as outfile:
|
with gzip.open("./01-lemmatized-words.csv.gz", "w") as outfile:
|
||||||
outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))
|
outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))
|
||||||
|
|
||||||
|
# Make a progress bar so logs can be printed
|
||||||
iter = tqdm(words)
|
iter = tqdm(words)
|
||||||
|
|
||||||
for word in iter:
|
# Zip (progress bar-ed) word list with nlp.pipe so nlp can process chunks at a time
|
||||||
|
for (word, spacy_word) in zip(iter, nlp.pipe(words)):
|
||||||
lemmatized_words = [
|
lemmatized_words = [
|
||||||
# Wordnet
|
|
||||||
(wordnet.lemmatize(word.lower()).upper(), "WORDNET"),
|
(wordnet.lemmatize(word.lower()).upper(), "WORDNET"),
|
||||||
# Spacy
|
(spacy_word[0].lemma_.upper(), "SPACY"),
|
||||||
(nlp(word)[0].lemma_.upper().upper(), "SPACY"),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
for lemmatized_word, lemmatizer in lemmatized_words:
|
for lemmatized_word, lemmatizer in lemmatized_words:
|
||||||
if word == lemmatized_word:
|
# if word == lemmatized_word:
|
||||||
continue
|
# # This word is its own lemmatization
|
||||||
|
# continue
|
||||||
|
|
||||||
if (word, lemmatized_word) in seen_lemmatizations:
|
# Skip words if we've already lemmatized them
|
||||||
continue
|
# if (word, lemmatized_word) in seen_lemmatizations: continue
|
||||||
|
# seen_lemmatizations.add((word, lemmatized_word))
|
||||||
|
|
||||||
seen_lemmatizations.add((word, lemmatized_word))
|
# Skip words if they've already been added
|
||||||
|
if lemmatized_word in seen_words:
|
||||||
|
iter.write(f"{lemmatized_word} ({lemmatizer})\talready in seen_words")
|
||||||
|
continue
|
||||||
|
seen_words.add(lemmatized_word)
|
||||||
|
|
||||||
if lemmatized_word not in words:
|
if lemmatized_word not in words:
|
||||||
iter.write(f"{lemmatized_word} ({lemmatizer}) not in all_words")
|
iter.write(f"{lemmatized_word} ({lemmatizer})\tnot in all_words")
|
||||||
erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
|
erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
Loading…
Reference in New Issue
Block a user