Commit before erasing old wordlist code

This commit is contained in:
Austen Adler 2023-03-02 00:20:09 -05:00
parent 6ebe8cd489
commit 3ac59f35ed
3 changed files with 3222 additions and 4 deletions

View File

@ -1 +1,122 @@
word,lemmatized_word word,lemmatized_word
the,THE
of,OF
to,TO
in,IN
is,BE
that,THAT
for,FOR
be,BE
by,BY
with,WITH
on,ON
not,NOT
this,THIS
are,BE
at,AT
from,FROM
he,HE
which,WHICH
his,HIS
have,HAVE
an,AN
but,BUT
you,YOU
they,THEY
were,BE
had,HAVE
we,WE
all,ALL
one,ONE
their,THEIR
been,BE
will,WILL
there,THERE
can,CAN
if,IF
other,OTHER
would,WOULD
no,NO
her,SHE
may,MAY
more,MORE
when,WHEN
who,WHO
such,SUCH
these,THESE
any,ANY
she,SHE
new,NEW
time,TIME
than,THAN
do,DO
some,SOME
what,WHAT
only,ONLY
into,INTO
them,THEY
two,TWO
also,ALSO
about,ABOUT
out,OUT
him,HE
my,MY
said,SAY
up,UP
our,OUR
first,FIRST
should,SHOULD
under,UNDER
made,MAKE
state,STATE
see,SEE
after,AFTER
could,COULD
then,THEN
me,I
most,MOST
over,OVER
very,VERY
your,YOUR
between,BETWEEN
where,WHERE
now,NOW
shall,SHALL
work,WORK
those,THOSE
same,SAME
well,WELL
each,EACH
many,MANY
being,BE
years,YEAR
did,DO
year,YEAR
through,THROUGH
must,MUST
upon,UPON
before,BEFORE
like,LIKE
use,USE
part,PART
general,GENERAL
people,PEOPLE
because,BECAUSE
used,USE
how,HOW
even,EVEN
much,MUCH
states,STATE
during,DURING
both,BOTH
case,CASE
three,THREE
number,NUMBER
make,MAKE
per,PER
great,GREAT
act,ACT
way,WAY
life,LIFE
good,GOOD
day,DAY

1 word lemmatized_word
2 the THE
3 of OF
4 to TO
5 in IN
6 is BE
7 that THAT
8 for FOR
9 be BE
10 by BY
11 with WITH
12 on ON
13 not NOT
14 this THIS
15 are BE
16 at AT
17 from FROM
18 he HE
19 which WHICH
20 his HIS
21 have HAVE
22 an AN
23 but BUT
24 you YOU
25 they THEY
26 were BE
27 had HAVE
28 we WE
29 all ALL
30 one ONE
31 their THEIR
32 been BE
33 will WILL
34 there THERE
35 can CAN
36 if IF
37 other OTHER
38 would WOULD
39 no NO
40 her SHE
41 may MAY
42 more MORE
43 when WHEN
44 who WHO
45 such SUCH
46 these THESE
47 any ANY
48 she SHE
49 new NEW
50 time TIME
51 than THAN
52 do DO
53 some SOME
54 what WHAT
55 only ONLY
56 into INTO
57 them THEY
58 two TWO
59 also ALSO
60 about ABOUT
61 out OUT
62 him HE
63 my MY
64 said SAY
65 up UP
66 our OUR
67 first FIRST
68 should SHOULD
69 under UNDER
70 made MAKE
71 state STATE
72 see SEE
73 after AFTER
74 could COULD
75 then THEN
76 me I
77 most MOST
78 over OVER
79 very VERY
80 your YOUR
81 between BETWEEN
82 where WHERE
83 now NOW
84 shall SHALL
85 work WORK
86 those THOSE
87 same SAME
88 well WELL
89 each EACH
90 many MANY
91 being BE
92 years YEAR
93 did DO
94 year YEAR
95 through THROUGH
96 must MUST
97 upon UPON
98 before BEFORE
99 like LIKE
100 use USE
101 part PART
102 general GENERAL
103 people PEOPLE
104 because BECAUSE
105 used USE
106 how HOW
107 even EVEN
108 much MUCH
109 states STATE
110 during DURING
111 both BOTH
112 case CASE
113 three THREE
114 number NUMBER
115 make MAKE
116 per PER
117 great GREAT
118 act ACT
119 way WAY
120 life LIFE
121 good GOOD
122 day DAY

View File

@ -86,11 +86,18 @@ for m in list(sum(custom_maps, ())):
print("Step 5") print("Step 5")
# Lemmatize all words (plural -> singular) # Lemmatize all words (plural -> singular)
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)] # lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
with open("01-lemmatized-words.csv", "w") as f: with open("01-lemmatized-words.csv", "w") as f:
f.write("word,lemmatized_word\n") f.write("word,lemmatized_word\n")
for w in lemmatize_mappings: iter = tqdm(all_words[:1000])
f.write(f"{w[0]},{w[1]}")
f.write("\n") for w in iter:
lemmatized_word = nlp(w)[0].lemma_.upper()
if lemmatized_word == w:
continue
if lemmatized_word not in all_words:
iter.write(f"{lemmatized_word} not in all_words")
f.write(f"{w},{lemmatized_word}\n")

3090
wordlist/02-exclude.txt Normal file

File diff suppressed because it is too large Load Diff