Use new wordlist generation logic
This commit is contained in:
parent
3ac59f35ed
commit
71404b8c6e
BIN
wordlist/00-frequency-list.csv.gz
Normal file
BIN
wordlist/00-frequency-list.csv.gz
Normal file
Binary file not shown.
501
wordlist/01-errored-lemmatized-words.csv
Normal file
501
wordlist/01-errored-lemmatized-words.csv
Normal file
@ -0,0 +1,501 @@
|
|||||||
|
WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER
|
||||||
|
THEMSELVES,THEMSELVE,SPACY
|
||||||
|
PERHAPS,PERHAP,SPACY
|
||||||
|
SERIES,SERIE,SPACY
|
||||||
|
OURSELVES,OURSELVE,SPACY
|
||||||
|
EXCEED,EXCEE,SPACY
|
||||||
|
BLEED,BLEE,SPACY
|
||||||
|
MATHEMATICS,MATHEMATIC,SPACY
|
||||||
|
NAKED,NAKE,SPACY
|
||||||
|
SKILLED,SKILLE,SPACY
|
||||||
|
BELOVED,BELOVE,SPACY
|
||||||
|
LEST,L,SPACY
|
||||||
|
WICKED,WICKE,SPACY
|
||||||
|
EMBED,EMBE,SPACY
|
||||||
|
DIABETES,DIABETE,SPACY
|
||||||
|
ONGOING,ONGOE,SPACY
|
||||||
|
ASHAMED,ASHAME,SPACY
|
||||||
|
CREED,CREE,SPACY
|
||||||
|
SINNER,SINN,SPACY
|
||||||
|
INDEBTED,INDEBTE,SPACY
|
||||||
|
UNCHANGED,UNCHANGE,SPACY
|
||||||
|
UNPUBLISHED,UNPUBLISHE,SPACY
|
||||||
|
UNEMPLOYED,UNEMPLOYE,SPACY
|
||||||
|
FORTHCOMING,FORTHCOME,SPACY
|
||||||
|
METAPHYSICS,METAPHYSIC,SPACY
|
||||||
|
TROUSERS,TROUSER,SPACY
|
||||||
|
UNAFFECTED,UNAFFECTE,SPACY
|
||||||
|
RENOWNED,RENOWNE,SPACY
|
||||||
|
TALENTED,TALENTE,SPACY
|
||||||
|
GREED,GREE,SPACY
|
||||||
|
UNFINISHED,UNFINISHE,SPACY
|
||||||
|
AESTHETICS,AESTHETIC,SPACY
|
||||||
|
INFRARED,INFRARE,SPACY
|
||||||
|
DISINTERESTED,DISINTERESTE,SPACY
|
||||||
|
UNNOTICED,UNNOTICE,SPACY
|
||||||
|
TING,TE,SPACY
|
||||||
|
ANNALS,ANNAL,SPACY
|
||||||
|
OUTSKIRTS,OUTSKIRT,SPACY
|
||||||
|
DETEST,DET,SPACY
|
||||||
|
FETTER,FETT,SPACY
|
||||||
|
SIDEWAYS,SIDEWAY,SPACY
|
||||||
|
ALMS,ALM,SPACY
|
||||||
|
MEASLES,MEASLE,SPACY
|
||||||
|
UNRESTRICTED,UNRESTRICTE,SPACY
|
||||||
|
ARREARS,ARREAR,SPACY
|
||||||
|
UNDEVELOPED,UNDEVELOPE,SPACY
|
||||||
|
CARIES,CARIE,SPACY
|
||||||
|
MORES,MORE,SPACY
|
||||||
|
UNALTERED,UNALTERE,SPACY
|
||||||
|
UNPROTECTED,UNPROTECTE,SPACY
|
||||||
|
UNEDUCATED,UNEDUCATE,SPACY
|
||||||
|
GALLOWS,GALLOW,SPACY
|
||||||
|
UNDATED,UNDATE,SPACY
|
||||||
|
UNNAMED,UNNAME,SPACY
|
||||||
|
MONIES,MONIE,SPACY
|
||||||
|
UNAIDED,UNAIDE,SPACY
|
||||||
|
UNQUESTIONED,UNQUESTIONE,SPACY
|
||||||
|
IMBED,IMBE,SPACY
|
||||||
|
AMINES,AMINE,SPACY
|
||||||
|
GRASSROOTS,GRASSROOT,SPACY
|
||||||
|
ACCURSED,ACCURSE,SPACY
|
||||||
|
UNDECIDED,UNDECIDE,SPACY
|
||||||
|
UNCHECKED,UNCHECKE,SPACY
|
||||||
|
UNCOMPLICATED,UNCOMPLICATE,SPACY
|
||||||
|
BELATED,BELATE,SPACY
|
||||||
|
UNDETERMINED,UNDETERMINE,SPACY
|
||||||
|
EAVES,EAVE,SPACY
|
||||||
|
DISAFFECTED,DISAFFECT,SPACY
|
||||||
|
UNFULFILLED,UNFULFILLE,SPACY
|
||||||
|
STRIATED,STRIATE,SPACY
|
||||||
|
RICKETS,RICKET,SPACY
|
||||||
|
BICEPS,BICEP,SPACY
|
||||||
|
DEATHBED,DEATHBE,SPACY
|
||||||
|
RABIES,RABIE,SPACY
|
||||||
|
UNATTENDED,UNATTENDE,SPACY
|
||||||
|
UNABATED,UNABATE,SPACY
|
||||||
|
MANNERED,MANNERE,SPACY
|
||||||
|
FAECES,FAECE,SPACY
|
||||||
|
QUADRUPED,QUADRUPE,SPACY
|
||||||
|
UNSTRUCTURED,UNSTRUCTURE,SPACY
|
||||||
|
UNAVAILING,UNAVAILE,SPACY
|
||||||
|
SUBSPECIES,SUBSPECIE,SPACY
|
||||||
|
UNDETECTED,UNDETECTE,SPACY
|
||||||
|
UNPLANNED,UNPLANNE,SPACY
|
||||||
|
UNCONDITIONED,UNCONDITIONE,SPACY
|
||||||
|
INBRED,INBREED,SPACY
|
||||||
|
TONGS,TONG,SPACY
|
||||||
|
BIGOTED,BIGOTE,SPACY
|
||||||
|
ENTRAILS,ENTRAIL,SPACY
|
||||||
|
UNEQUALLED,UNEQUALLE,SPACY
|
||||||
|
ONCOMING,ONCOME,SPACY
|
||||||
|
UNTHINKING,UNTHINKE,SPACY
|
||||||
|
MENSES,MENSE,SPACY
|
||||||
|
UNMITIGATED,UNMITIGATE,SPACY
|
||||||
|
UNRECORDED,UNRECORDE,SPACY
|
||||||
|
SEMIOTICS,SEMIOTIC,SPACY
|
||||||
|
UNACKNOWLEDGED,UNACKNOWLEDGE,SPACY
|
||||||
|
UNDISGUISED,UNDISGUISE,SPACY
|
||||||
|
PYRITES,PYRITE,SPACY
|
||||||
|
UNOBSTRUCTED,UNOBSTRUCTE,SPACY
|
||||||
|
UNATTACHED,UNATTACHE,SPACY
|
||||||
|
UNMATCHED,UNMATCHE,SPACY
|
||||||
|
PLIERS,PLIER,SPACY
|
||||||
|
ENAMORED,ENAMOR,SPACY
|
||||||
|
PANTIES,PANTIE,SPACY
|
||||||
|
BACKWOODS,BACKWOOD,SPACY
|
||||||
|
UNPROVOKED,UNPROVOKE,SPACY
|
||||||
|
TRICEPS,TRICEP,SPACY
|
||||||
|
UNCHARTED,UNCHARTE,SPACY
|
||||||
|
MALNOURISHED,MALNOURISH,SPACY
|
||||||
|
MONEYED,MONEYE,SPACY
|
||||||
|
UNTAMED,UNTAME,SPACY
|
||||||
|
METHYLATED,METHYLATE,SPACY
|
||||||
|
UNRELIEVED,UNRELIEVE,SPACY
|
||||||
|
UNLETTERED,UNLETTERE,SPACY
|
||||||
|
HOTBED,HOTBE,SPACY
|
||||||
|
UNIMPROVED,UNIMPROVE,SPACY
|
||||||
|
LOPSIDED,LOPSIDE,SPACY
|
||||||
|
LEGGING,LEGGE,SPACY
|
||||||
|
UNFEIGNED,UNFEIGNE,SPACY
|
||||||
|
UNTESTED,UNTESTE,SPACY
|
||||||
|
UNBLEMISHED,UNBLEMISHE,SPACY
|
||||||
|
TECHNICS,TECHNIC,SPACY
|
||||||
|
UNTAINTED,UNTAINTE,SPACY
|
||||||
|
UNREGISTERED,UNREGISTERE,SPACY
|
||||||
|
UNFORMED,UNFORME,SPACY
|
||||||
|
OVERWEENING,OVERWEENE,SPACY
|
||||||
|
UNPROVIDED,UNPROVIDE,SPACY
|
||||||
|
FOOTLIGHTS,FOOTLIGHT,SPACY
|
||||||
|
UNCONVERTED,UNCONVERTE,SPACY
|
||||||
|
OBSEQUIES,OBSEQUIE,SPACY
|
||||||
|
PINCERS,PINCER,SPACY
|
||||||
|
MALADJUSTED,MALADJUSTE,SPACY
|
||||||
|
ISOSCELES,ISOSCELE,SPACY
|
||||||
|
UNPROVED,UNPROVE,SPACY
|
||||||
|
AMIDSHIPS,AMIDSHIP,SPACY
|
||||||
|
SEMISKILLED,SEMISKILLE,SPACY
|
||||||
|
UNDIRECTED,UNDIRECTE,SPACY
|
||||||
|
TABES,TABE,SPACY
|
||||||
|
UNCLAIMED,UNCLAIME,SPACY
|
||||||
|
UNPOLISHED,UNPOLISHE,SPACY
|
||||||
|
FARSIGHTED,FARSIGHTE,SPACY
|
||||||
|
FAUCES,FAUCE,SPACY
|
||||||
|
NONCOMMISSIONED,NONCOMMISSIONE,SPACY
|
||||||
|
UNCHARGED,UNCHARGE,SPACY
|
||||||
|
CONGERIES,CONGERIE,SPACY
|
||||||
|
SCABIES,SCABIE,SPACY
|
||||||
|
MALFORMED,MALFORME,SPACY
|
||||||
|
INFORMATICS,INFORMATIC,SPACY
|
||||||
|
INCREMENTED,INCREMENTE,SPACY
|
||||||
|
UNDISTRIBUTED,UNDISTRIBUTE,SPACY
|
||||||
|
HYDRODYNAMICS,HYDRODYNAMIC,SPACY
|
||||||
|
ANTIPODES,ANTIPODE,SPACY
|
||||||
|
UNDEREMPLOYED,UNDEREMPLOYE,SPACY
|
||||||
|
BIPED,BIPE,SPACY
|
||||||
|
ELECTRODYNAMICS,ELECTRODYNAMIC,SPACY
|
||||||
|
FEEBLEMINDED,FEEBLEMINDE,SPACY
|
||||||
|
SUDS,SUD,SPACY
|
||||||
|
UNDERSIZED,UNDERSIZE,SPACY
|
||||||
|
HUSTINGS,HUSTING,SPACY
|
||||||
|
STOREYED,STOREYE,SPACY
|
||||||
|
UNREFINED,UNREFINE,SPACY
|
||||||
|
UNTURNED,UNTURNE,SPACY
|
||||||
|
SERRIED,SERRIE,SPACY
|
||||||
|
DOLDRUMS,DOLDRUM,SPACY
|
||||||
|
STATS,STAT,SPACY
|
||||||
|
GULES,GULE,SPACY
|
||||||
|
UNDERPANTS,UNDERPANT,SPACY
|
||||||
|
UNREWARDING,UNREWARDE,SPACY
|
||||||
|
CALIPERS,CALIPER,SPACY
|
||||||
|
ONESIDED,ONESIDE,SPACY
|
||||||
|
UNABRIDGED,UNABRIDGE,SPACY
|
||||||
|
UNFURNISHED,UNFURNISHE,SPACY
|
||||||
|
UNREDEEMED,UNREDEEME,SPACY
|
||||||
|
UNSEEING,UNSEEE,SPACY
|
||||||
|
KNICKERS,KNICKER,SPACY
|
||||||
|
UNLISTED,UNLISTE,SPACY
|
||||||
|
INNARDS,INNARD,SPACY
|
||||||
|
GLANDERS,GLANDER,SPACY
|
||||||
|
OVEREXTENDED,OVEREXTEND,SPACY
|
||||||
|
RELATIVIZED,RELATIVIZE,SPACY
|
||||||
|
UNFUNDED,UNFUNDE,SPACY
|
||||||
|
COLUMNED,COLUMNE,SPACY
|
||||||
|
CALISTHENICS,CALISTHENIC,SPACY
|
||||||
|
SUPERFICIES,SUPERFICIE,SPACY
|
||||||
|
CASTELLATED,CASTELLATE,SPACY
|
||||||
|
PUREBRED,PUREBRE,SPACY
|
||||||
|
CORTES,CORTE,SPACY
|
||||||
|
REDHEADED,REDHEADE,SPACY
|
||||||
|
CASTANETS,CASTANET,SPACY
|
||||||
|
UNPRACTISED,UNPRACTISE,SPACY
|
||||||
|
UNEDITED,UNEDITE,SPACY
|
||||||
|
LIVERIED,LIVERIE,SPACY
|
||||||
|
NOSEBLEED,NOSEBLEE,SPACY
|
||||||
|
UNDERFUNDED,UNDERFUNDE,SPACY
|
||||||
|
UNGRADED,UNGRADE,SPACY
|
||||||
|
UNREDUCED,UNREDUCE,SPACY
|
||||||
|
SIDEBURNS,SIDEBURN,SPACY
|
||||||
|
SICKBED,SICKBE,SPACY
|
||||||
|
FASCES,FASCE,SPACY
|
||||||
|
AVIONICS,AVIONIC,SPACY
|
||||||
|
CRENELATED,CRENELATE,SPACY
|
||||||
|
NATES,NATE,SPACY
|
||||||
|
UNREGARDED,UNREGARDE,SPACY
|
||||||
|
UNRECONSTRUCTED,UNRECONSTRUCTE,SPACY
|
||||||
|
BRITCHES,BRITCHE,SPACY
|
||||||
|
DEDANS,DEDAN,SPACY
|
||||||
|
PARATROOPS,PARATROOP,SPACY
|
||||||
|
FINEGRAINED,FINEGRAINE,SPACY
|
||||||
|
GREAVES,GREAVE,SPACY
|
||||||
|
SUBSCRIPTED,SUBSCRIPTE,SPACY
|
||||||
|
LUES,LUE,SPACY
|
||||||
|
BIOMETRICS,BIOMETRIC,SPACY
|
||||||
|
ARIES,ARIE,SPACY
|
||||||
|
GASWORKS,GASWORK,SPACY
|
||||||
|
BULLETED,BULLETE,SPACY
|
||||||
|
HEARTSTRINGS,HEARTSTRING,SPACY
|
||||||
|
INCREMENTING,INCREMENTE,SPACY
|
||||||
|
UNCLEARED,UNCLEARE,SPACY
|
||||||
|
CONSOLS,CONSOL,SPACY
|
||||||
|
MUDFLATS,MUDFLAT,SPACY
|
||||||
|
BADLANDS,BADLAND,SPACY
|
||||||
|
TALIPES,TALIPE,SPACY
|
||||||
|
LANCINATING,LANCINATE,SPACY
|
||||||
|
UNACCOMPLISHED,UNACCOMPLISHE,SPACY
|
||||||
|
TESSELLATED,TESSELLATE,SPACY
|
||||||
|
SEMIFINISHED,SEMIFINISHE,SPACY
|
||||||
|
UNAVENGED,UNAVENGE,SPACY
|
||||||
|
SOAPSUDS,SOAPSUD,SPACY
|
||||||
|
YEATS,YEAT,SPACY
|
||||||
|
TELEMATICS,TELEMATIC,SPACY
|
||||||
|
UNNOTED,UNNOTE,SPACY
|
||||||
|
MEALIES,MEALIE,SPACY
|
||||||
|
PARIES,PARIE,SPACY
|
||||||
|
AUROCHS,AUROCH,SPACY
|
||||||
|
AGOING,AGOE,SPACY
|
||||||
|
ODDMENTS,ODDMENT,SPACY
|
||||||
|
CLEARHEADED,CLEARHEADE,SPACY
|
||||||
|
REDOUBTED,REDOUBTE,SPACY
|
||||||
|
IVIED,IVIE,SPACY
|
||||||
|
PINNIPED,PINNIPE,SPACY
|
||||||
|
DEFATTED,DEFATTE,SPACY
|
||||||
|
DECAFFEINATED,DECAFFEINATE,SPACY
|
||||||
|
NINEPINS,NINEPIN,SPACY
|
||||||
|
CAMPHORATED,CAMPHORATE,SPACY
|
||||||
|
GLASSWORKS,GLASSWORK,SPACY
|
||||||
|
SORITES,SORITE,SPACY
|
||||||
|
AFFOREST,AFFOR,SPACY
|
||||||
|
DISSAVING,DISSAVE,SPACY
|
||||||
|
UNADVISED,UNADVISE,SPACY
|
||||||
|
UNRECLAIMED,UNRECLAIME,SPACY
|
||||||
|
LARES,LARE,SPACY
|
||||||
|
LEVELHEADED,LEVELHEADE,SPACY
|
||||||
|
SWEATPANTS,SWEATPANT,SPACY
|
||||||
|
LOTOS,LOTO,SPACY
|
||||||
|
GIBLETS,GIBLET,SPACY
|
||||||
|
UNBOWED,UNBOWE,SPACY
|
||||||
|
UNPROMPTED,UNPROMPTE,SPACY
|
||||||
|
ABSCESSED,ABSCESSE,SPACY
|
||||||
|
NODULATED,NODULATE,SPACY
|
||||||
|
RUBENS,RUBEN,SPACY
|
||||||
|
UNPAGED,UNPAGE,SPACY
|
||||||
|
CALENDS,CALEND,SPACY
|
||||||
|
TRUNKED,TRUNKE,SPACY
|
||||||
|
TROUSERED,TROUSERE,SPACY
|
||||||
|
PENATES,PENATE,SPACY
|
||||||
|
COMBINATORICS,COMBINATORIC,SPACY
|
||||||
|
TRESSED,TRESSE,SPACY
|
||||||
|
PARTICOLOURED,PARTICOLOURE,SPACY
|
||||||
|
UNENCRYPTED,UNENCRYPTE,SPACY
|
||||||
|
ASTRONAUTICS,ASTRONAUTIC,SPACY
|
||||||
|
HYDROPONICS,HYDROPONIC,SPACY
|
||||||
|
UNFORMATTED,UNFORMATTE,SPACY
|
||||||
|
SEMIDETACHED,SEMIDETACHE,SPACY
|
||||||
|
BONKERS,BONKER,SPACY
|
||||||
|
UNDIES,UNDIE,SPACY
|
||||||
|
EPS,EP,SPACY
|
||||||
|
GIMBALS,GIMBAL,SPACY
|
||||||
|
BALCONIED,BALCONIE,SPACY
|
||||||
|
SALTWORKS,SALTWORK,SPACY
|
||||||
|
UNPLEDGED,UNPLEDGE,SPACY
|
||||||
|
PREDESIGNED,PREDESIGNE,SPACY
|
||||||
|
NFS,NF,SPACY
|
||||||
|
UNDERBRED,UNDERBRE,SPACY
|
||||||
|
PRECOMPILED,PRECOMPILE,SPACY
|
||||||
|
KALENDS,KALEND,SPACY
|
||||||
|
LITOTES,LITOTE,SPACY
|
||||||
|
INDIGESTED,INDIGESTE,SPACY
|
||||||
|
CITS,CIT,SPACY
|
||||||
|
UNPRICED,UNPRICE,SPACY
|
||||||
|
PINCHERS,PINCHER,SPACY
|
||||||
|
CANCELLATED,CANCELLATE,SPACY
|
||||||
|
CHITTERLINGS,CHITTERLING,SPACY
|
||||||
|
DIBS,DIB,SPACY
|
||||||
|
RIGHTWARDS,RIGHTWARD,SPACY
|
||||||
|
CONVENANCES,CONVENANCE,SPACY
|
||||||
|
INTERALLIED,INTERALLIE,SPACY
|
||||||
|
FLINDERS,FLINDER,SPACY
|
||||||
|
CRANNIED,CRANNIE,SPACY
|
||||||
|
HOMEBRED,HOMEBRE,SPACY
|
||||||
|
HIGHBRED,HIGHBRE,SPACY
|
||||||
|
UNRULED,UNRULE,SPACY
|
||||||
|
FOREHANDED,FOREHANDE,SPACY
|
||||||
|
PREPACKED,PREPACKE,SPACY
|
||||||
|
UNWISHED,UNWISHE,SPACY
|
||||||
|
ENTREMETS,ENTREMET,SPACY
|
||||||
|
ESTOVERS,ESTOVER,SPACY
|
||||||
|
ANGELES,ANGELE,SPACY
|
||||||
|
DAISIED,DAISIE,SPACY
|
||||||
|
UPRATED,UPRATE,SPACY
|
||||||
|
THIGHED,THIGHE,SPACY
|
||||||
|
TURPS,TURP,SPACY
|
||||||
|
WEAZENED,WEAZENE,SPACY
|
||||||
|
EFFING,EFF,SPACY
|
||||||
|
HOLS,HOL,SPACY
|
||||||
|
JIGGERED,JIGGERE,SPACY
|
||||||
|
SOCRATES,SOCRATE,SPACY
|
||||||
|
AUDITORIES,AUDITORIE,SPACY
|
||||||
|
AMBAGES,AMBAGE,SPACY
|
||||||
|
DOITED,DOITE,SPACY
|
||||||
|
BIONICS,BIONIC,SPACY
|
||||||
|
UNREFERENCED,UNREFERENCE,SPACY
|
||||||
|
EXEQUIES,EXEQUIE,SPACY
|
||||||
|
CERASTES,CERASTE,SPACY
|
||||||
|
SEMIMANUFACTURES,SEMIMANUFACTURE,SPACY
|
||||||
|
GALLUSES,GALLUS,SPACY
|
||||||
|
RERECORDED,RERECORDE,SPACY
|
||||||
|
TELESALES,TELESALE,SPACY
|
||||||
|
MICROGRAPHICS,MICROGRAPHIC,SPACY
|
||||||
|
SIEMENS,SIEMEN,SPACY
|
||||||
|
ZOUNDS,ZOUND,SPACY
|
||||||
|
SEMIFIXED,SEMIFIXE,SPACY
|
||||||
|
UNDIVERTED,UNDIVERTE,SPACY
|
||||||
|
SANIES,SANIE,SPACY
|
||||||
|
BREECHING,BREECHE,SPACY
|
||||||
|
MENTHOLATED,MENTHOLATE,SPACY
|
||||||
|
PANTALETS,PANTALET,SPACY
|
||||||
|
CRUDITES,CRUDITE,SPACY
|
||||||
|
TRAPES,TRAPE,SPACY
|
||||||
|
PIXILATED,PIXILATE,SPACY
|
||||||
|
BOOTES,BOOTE,SPACY
|
||||||
|
UNPOSTED,UNPOSTE,SPACY
|
||||||
|
HANTS,HANT,SPACY
|
||||||
|
UNDETAILED,UNDETAILE,SPACY
|
||||||
|
HAVINGS,HAVING,SPACY
|
||||||
|
OUTGIVING,OUTGIVE,SPACY
|
||||||
|
UNCOMPLEMENTED,UNCOMPLEMENTE,SPACY
|
||||||
|
PRATIES,PRATIE,SPACY
|
||||||
|
ELEVENSES,ELEVENSE,SPACY
|
||||||
|
UNENLIVENED,UNENLIVENE,SPACY
|
||||||
|
NANTES,NANTE,SPACY
|
||||||
|
AFFINED,AFFINE,SPACY
|
||||||
|
NONNESTED,NONNESTE,SPACY
|
||||||
|
FALLOWING,FALLOWE,SPACY
|
||||||
|
HYDROMECHANICS,HYDROMECHANIC,SPACY
|
||||||
|
CLIVERS,CLIVER,SPACY
|
||||||
|
UNICES,UNICE,SPACY
|
||||||
|
GRAMMATICS,GRAMMATIC,SPACY
|
||||||
|
PRAPS,PRAP,SPACY
|
||||||
|
INTERWORKING,INTERWORKE,SPACY
|
||||||
|
HERCULES,HERCULE,SPACY
|
||||||
|
BIGHEADED,BIGHEADE,SPACY
|
||||||
|
KIES,KY,SPACY
|
||||||
|
NETHERLANDS,NETHERLAND,SPACY
|
||||||
|
UNBOOKED,UNBOOKE,SPACY
|
||||||
|
QUINS,QUIN,SPACY
|
||||||
|
CANNES,CANNE,SPACY
|
||||||
|
UNNURTURED,UNNURTURE,SPACY
|
||||||
|
WEDGIES,WEDGIE,SPACY
|
||||||
|
HANDWORKED,HANDWORKE,SPACY
|
||||||
|
ANALECTS,ANALECT,SPACY
|
||||||
|
HERTS,HERT,SPACY
|
||||||
|
ORLEANS,ORLEAN,SPACY
|
||||||
|
PESCADORES,PESCADORE,SPACY
|
||||||
|
ULCERED,ULCERE,SPACY
|
||||||
|
MISCREATED,MISCREATE,SPACY
|
||||||
|
UNPRIZED,UNPRIZE,SPACY
|
||||||
|
SLYBOOTS,SLYBOOT,SPACY
|
||||||
|
RUNTED,RUNTE,SPACY
|
||||||
|
REATTRIBUTED,REATTRIBUTE,SPACY
|
||||||
|
HOUSETRAINED,HOUSETRAINE,SPACY
|
||||||
|
SOBERSIDES,SOBERSIDE,SPACY
|
||||||
|
COLESEED,COLESEE,SPACY
|
||||||
|
BLUCHERS,BLUCHER,SPACY
|
||||||
|
MUGGINS,MUGGIN,SPACY
|
||||||
|
UNCRIPPLED,UNCRIPPLE,SPACY
|
||||||
|
HEPPED,HEPPE,SPACY
|
||||||
|
WITHINDOORS,WITHINDOOR,SPACY
|
||||||
|
BEESTINGS,BEESTING,SPACY
|
||||||
|
FLANDERS,FLANDER,SPACY
|
||||||
|
DIOGENES,DIOGENE,SPACY
|
||||||
|
COSMONAUTICS,COSMONAUTIC,SPACY
|
||||||
|
WHOLEGRAINS,WHOLEGRAIN,SPACY
|
||||||
|
NEEDMENTS,NEEDMENT,SPACY
|
||||||
|
ACHATES,ACHATE,SPACY
|
||||||
|
PRECOMPILING,PRECOMPILE,SPACY
|
||||||
|
BALUSTERED,BALUSTERE,SPACY
|
||||||
|
JUGGINS,JUGGIN,SPACY
|
||||||
|
UNCONFIGURED,UNCONFIGURE,SPACY
|
||||||
|
SLUGABED,SLUGABE,SPACY
|
||||||
|
CHARGRILLED,CHARGRILLE,SPACY
|
||||||
|
GANGES,GANGE,SPACY
|
||||||
|
FLATWAYS,FLATWAY,SPACY
|
||||||
|
CHAMPERS,CHAMPER,SPACY
|
||||||
|
GOLDILOCKS,GOLDILOCK,SPACY
|
||||||
|
REIMS,REIM,SPACY
|
||||||
|
REIMPORTING,REIMPORTE,SPACY
|
||||||
|
EMOTIONED,EMOTIONE,SPACY
|
||||||
|
AIRBED,AIRBE,SPACY
|
||||||
|
GIGAFLOPS,GIGAFLOP,SPACY
|
||||||
|
YONKS,YONK,SPACY
|
||||||
|
CASALS,CASAL,SPACY
|
||||||
|
ROCKIES,ROCKIE,SPACY
|
||||||
|
ORESTES,ORESTE,SPACY
|
||||||
|
REMAPPING,REMAPPE,SPACY
|
||||||
|
EBONICS,EBONIC,SPACY
|
||||||
|
BRUGES,BRUGE,SPACY
|
||||||
|
JANKERS,JANKER,SPACY
|
||||||
|
NOTTS,NOTT,SPACY
|
||||||
|
PROCRUSTES,PROCRUSTE,SPACY
|
||||||
|
MULTISCALED,MULTISCALE,SPACY
|
||||||
|
AGROTECHNICS,AGROTECHNIC,SPACY
|
||||||
|
WAYGOING,WAYGOE,SPACY
|
||||||
|
GENDERING,GENDERE,SPACY
|
||||||
|
TELEMECHANICS,TELEMECHANIC,SPACY
|
||||||
|
DEGATING,DEGATE,SPACY
|
||||||
|
THAMES,THAME,SPACY
|
||||||
|
LOWLIVED,LOWLIVE,SPACY
|
||||||
|
REEDING,REEDE,SPACY
|
||||||
|
INTERCROSSING,INTERCROSSE,SPACY
|
||||||
|
UNDEDUCTED,UNDEDUCTE,SPACY
|
||||||
|
AGOGICS,AGOGIC,SPACY
|
||||||
|
UNATTENDING,UNATTENDE,SPACY
|
||||||
|
OVERMASTED,OVERMASTE,SPACY
|
||||||
|
GILES,GILE,SPACY
|
||||||
|
NONCOPYRIGHTED,NONCOPYRIGHTE,SPACY
|
||||||
|
LUDDITES,LUDDITE,SPACY
|
||||||
|
SCURVIED,SCURVIE,SPACY
|
||||||
|
REBREAKING,REBREAKE,SPACY
|
||||||
|
KEATS,KEAT,SPACY
|
||||||
|
CERVANTES,CERVANTE,SPACY
|
||||||
|
UNCONDONED,UNCONDONE,SPACY
|
||||||
|
DESCARTES,DESCARTE,SPACY
|
||||||
|
BEJABERS,BEJABER,SPACY
|
||||||
|
VIDEOGRAPHICS,VIDEOGRAPHIC,SPACY
|
||||||
|
EURIPIDES,EURIPIDE,SPACY
|
||||||
|
UNPERJURED,UNPERJURE,SPACY
|
||||||
|
LAERTES,LAERTE,SPACY
|
||||||
|
OVERCOLLECTED,OVERCOLLECTE,SPACY
|
||||||
|
AMPHIBRACHYS,AMPHIBRACHY,SPACY
|
||||||
|
CHEOPS,CHEOP,SPACY
|
||||||
|
CHALONS,CHALON,SPACY
|
||||||
|
VERSICOLOURED,VERSICOLOURE,SPACY
|
||||||
|
SUBPARTITIONED,SUBPARTITIONE,SPACY
|
||||||
|
BALBUTIES,BALBUTIE,SPACY
|
||||||
|
ARCHIMEDES,ARCHIMEDE,SPACY
|
||||||
|
GATELEGGED,GATELEGGE,SPACY
|
||||||
|
POITIERS,POITIER,SPACY
|
||||||
|
HAVERING,HAVERE,SPACY
|
||||||
|
THEBES,THEBE,SPACY
|
||||||
|
SEVRES,SEVRE,SPACY
|
||||||
|
PERICLES,PERICLE,SPACY
|
||||||
|
LIMOGES,LIMOGE,SPACY
|
||||||
|
EVENTING,EVENTE,SPACY
|
||||||
|
FATBITS,FATBIT,SPACY
|
||||||
|
HUTTING,HUTTE,SPACY
|
||||||
|
DOGSHORES,DOGSHORE,SPACY
|
||||||
|
OVERBADING,OVERBADE,SPACY
|
||||||
|
AZORES,AZORE,SPACY
|
||||||
|
BLEWITS,BLEWIT,SPACY
|
||||||
|
HIPOCRATES,HIPOCRATE,SPACY
|
||||||
|
AMIENS,AMIEN,SPACY
|
||||||
|
GUTTING,GUTTE,SPACY
|
||||||
|
GLADYS,GLADY,SPACY
|
||||||
|
CHADDED,CHADDE,SPACY
|
||||||
|
EUPHRATES,EUPHRATE,SPACY
|
||||||
|
TROWING,TROWE,SPACY
|
||||||
|
LACEUPS,LACEUP,SPACY
|
||||||
|
ALIPED,ALIPE,SPACY
|
||||||
|
TALIPED,TALIPE,SPACY
|
||||||
|
RAMSES,RAMSE,SPACY
|
||||||
|
CENTRONICS,CENTRONIC,SPACY
|
||||||
|
BANTING,BANTE,SPACY
|
||||||
|
TELEPHOTOLENS,TELEPHOTOLEN,SPACY
|
||||||
|
ARAKS,ARAK,SPACY
|
||||||
|
DONETS,DONET,SPACY
|
||||||
|
CEROPLASTICS,CEROPLASTIC,SPACY
|
||||||
|
BAYNETWORKS,BAYNETWORK,SPACY
|
||||||
|
NORWARDS,NORWARD,SPACY
|
||||||
|
HAPPING,HAPPE,SPACY
|
||||||
|
BARENTS,BARENT,SPACY
|
||||||
|
ABLINGS,ABLING,SPACY
|
||||||
|
CELLING,CELLE,SPACY
|
||||||
|
CELEBES,CELEBE,SPACY
|
||||||
|
NENETS,NENET,SPACY
|
||||||
|
IMPING,IMPE,SPACY
|
||||||
|
LINARES,LINARE,SPACY
|
||||||
|
VAILING,VAILE,SPACY
|
||||||
|
HABDABS,HABDAB,SPACY
|
||||||
|
RELISTING,RELISTE,SPACY
|
||||||
|
HOUGHING,HOUGHE,SPACY
|
|
@ -1,122 +0,0 @@
|
|||||||
word,lemmatized_word
|
|
||||||
the,THE
|
|
||||||
of,OF
|
|
||||||
to,TO
|
|
||||||
in,IN
|
|
||||||
is,BE
|
|
||||||
that,THAT
|
|
||||||
for,FOR
|
|
||||||
be,BE
|
|
||||||
by,BY
|
|
||||||
with,WITH
|
|
||||||
on,ON
|
|
||||||
not,NOT
|
|
||||||
this,THIS
|
|
||||||
are,BE
|
|
||||||
at,AT
|
|
||||||
from,FROM
|
|
||||||
he,HE
|
|
||||||
which,WHICH
|
|
||||||
his,HIS
|
|
||||||
have,HAVE
|
|
||||||
an,AN
|
|
||||||
but,BUT
|
|
||||||
you,YOU
|
|
||||||
they,THEY
|
|
||||||
were,BE
|
|
||||||
had,HAVE
|
|
||||||
we,WE
|
|
||||||
all,ALL
|
|
||||||
one,ONE
|
|
||||||
their,THEIR
|
|
||||||
been,BE
|
|
||||||
will,WILL
|
|
||||||
there,THERE
|
|
||||||
can,CAN
|
|
||||||
if,IF
|
|
||||||
other,OTHER
|
|
||||||
would,WOULD
|
|
||||||
no,NO
|
|
||||||
her,SHE
|
|
||||||
may,MAY
|
|
||||||
more,MORE
|
|
||||||
when,WHEN
|
|
||||||
who,WHO
|
|
||||||
such,SUCH
|
|
||||||
these,THESE
|
|
||||||
any,ANY
|
|
||||||
she,SHE
|
|
||||||
new,NEW
|
|
||||||
time,TIME
|
|
||||||
than,THAN
|
|
||||||
do,DO
|
|
||||||
some,SOME
|
|
||||||
what,WHAT
|
|
||||||
only,ONLY
|
|
||||||
into,INTO
|
|
||||||
them,THEY
|
|
||||||
two,TWO
|
|
||||||
also,ALSO
|
|
||||||
about,ABOUT
|
|
||||||
out,OUT
|
|
||||||
him,HE
|
|
||||||
my,MY
|
|
||||||
said,SAY
|
|
||||||
up,UP
|
|
||||||
our,OUR
|
|
||||||
first,FIRST
|
|
||||||
should,SHOULD
|
|
||||||
under,UNDER
|
|
||||||
made,MAKE
|
|
||||||
state,STATE
|
|
||||||
see,SEE
|
|
||||||
after,AFTER
|
|
||||||
could,COULD
|
|
||||||
then,THEN
|
|
||||||
me,I
|
|
||||||
most,MOST
|
|
||||||
over,OVER
|
|
||||||
very,VERY
|
|
||||||
your,YOUR
|
|
||||||
between,BETWEEN
|
|
||||||
where,WHERE
|
|
||||||
now,NOW
|
|
||||||
shall,SHALL
|
|
||||||
work,WORK
|
|
||||||
those,THOSE
|
|
||||||
same,SAME
|
|
||||||
well,WELL
|
|
||||||
each,EACH
|
|
||||||
many,MANY
|
|
||||||
being,BE
|
|
||||||
years,YEAR
|
|
||||||
did,DO
|
|
||||||
year,YEAR
|
|
||||||
through,THROUGH
|
|
||||||
must,MUST
|
|
||||||
upon,UPON
|
|
||||||
before,BEFORE
|
|
||||||
like,LIKE
|
|
||||||
use,USE
|
|
||||||
part,PART
|
|
||||||
general,GENERAL
|
|
||||||
people,PEOPLE
|
|
||||||
because,BECAUSE
|
|
||||||
used,USE
|
|
||||||
how,HOW
|
|
||||||
even,EVEN
|
|
||||||
much,MUCH
|
|
||||||
states,STATE
|
|
||||||
during,DURING
|
|
||||||
both,BOTH
|
|
||||||
case,CASE
|
|
||||||
three,THREE
|
|
||||||
number,NUMBER
|
|
||||||
make,MAKE
|
|
||||||
per,PER
|
|
||||||
great,GREAT
|
|
||||||
act,ACT
|
|
||||||
way,WAY
|
|
||||||
life,LIFE
|
|
||||||
good,GOOD
|
|
||||||
day,DAY
|
|
|
BIN
wordlist/01-lemmatized-words.csv.gz
Normal file
BIN
wordlist/01-lemmatized-words.csv.gz
Normal file
Binary file not shown.
@ -1,103 +1,70 @@
|
|||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
|
|
||||||
print("Step 1")
|
print("Loading dependencies")
|
||||||
|
|
||||||
|
|
||||||
try:
|
|
||||||
_initialized
|
|
||||||
except:
|
|
||||||
# !pip install spacy
|
|
||||||
# !python -m spacy download en_core_web_trf
|
|
||||||
import spacy
|
import spacy
|
||||||
|
import nltk
|
||||||
from tqdm import tqdm
|
from tqdm import tqdm
|
||||||
|
import gzip
|
||||||
|
|
||||||
|
# Wordnet
|
||||||
|
try:
|
||||||
|
from nltk.stem.wordnet import WordNetLemmatizer
|
||||||
|
except:
|
||||||
|
nltk.download("wordnet")
|
||||||
|
from nltk.stem.wordnet import WordNetLemmatizer
|
||||||
|
wordnet = WordNetLemmatizer()
|
||||||
|
|
||||||
|
# Spacy
|
||||||
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
|
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
|
||||||
|
|
||||||
_initialized=True
|
print("Loading initial wordlist")
|
||||||
|
|
||||||
import pandas as pd
|
words = []
|
||||||
import gzip
|
with gzip.open("./00-frequency-list.csv.gz", 'r') as infile:
|
||||||
import re
|
for line in infile:
|
||||||
|
words.append(line.decode('ascii').split(",")[0])
|
||||||
|
|
||||||
|
# Remove header
|
||||||
|
words = words[1:]
|
||||||
|
|
||||||
print("Step 2")
|
print(words[0:5])
|
||||||
|
|
||||||
|
print("Lemmatizing words")
|
||||||
|
|
||||||
def get_lines(filename):
|
seen_lemmatizations = set()
|
||||||
with gzip.open(filename, 'r') as f:
|
|
||||||
ret = []
|
|
||||||
for l in f:
|
|
||||||
if len(ret) > 30_000:
|
|
||||||
return ret
|
|
||||||
ret.append(str(l).lower())
|
|
||||||
return ret
|
|
||||||
|
|
||||||
|
with open("./01-errored-lemmatized-words.csv", 'w') as erroutfile:
|
||||||
|
erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")
|
||||||
|
|
||||||
|
with gzip.open("./01-lemmatized-words.csv.gz", 'w') as outfile:
|
||||||
|
outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))
|
||||||
|
|
||||||
WORDLIST_SIZE = 8192 + 3
|
iter = tqdm(words)
|
||||||
word_re = re.compile(r"^[A-Za-z]+$")
|
|
||||||
|
|
||||||
|
for word in iter:
|
||||||
print("Step 3")
|
lemmatized_words = [
|
||||||
|
# Wordnet
|
||||||
|
(wordnet.lemmatize(word).upper(), 'WORDNET'),
|
||||||
annotated_words=pd.read_excel("annotated_words.ods")
|
# Spacy
|
||||||
|
(nlp(word)[0].lemma_.upper().upper(), 'SPACY'),
|
||||||
excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
|
|
||||||
excluded_words[0:10]
|
|
||||||
|
|
||||||
custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
|
|
||||||
|
|
||||||
custom_maps = [
|
|
||||||
(m[1]["word"].lower(), mapping.lower())
|
|
||||||
for m in custom_maps.iterrows()
|
|
||||||
for mapping in m[1]["maps_to"]
|
|
||||||
]
|
]
|
||||||
custom_maps
|
|
||||||
|
|
||||||
|
for (lemmatized_word, lemmatizer) in lemmatized_words:
|
||||||
print("Step 4")
|
if word == lemmatized_word:
|
||||||
|
|
||||||
|
|
||||||
# Start parsing the wordlist
|
|
||||||
all_words = get_lines("00-frequency-all.txt.gz")
|
|
||||||
|
|
||||||
# Delete header line
|
|
||||||
all_words = all_words[1:]
|
|
||||||
|
|
||||||
# Get only the word (fixed width)
|
|
||||||
all_words = [w[13:36].strip() for w in all_words]
|
|
||||||
|
|
||||||
# Remove special characters
|
|
||||||
all_words = [w for w in all_words if word_re.search(w)]
|
|
||||||
|
|
||||||
# Remove all removed words
|
|
||||||
all_words = [w for w in all_words if w not in excluded_words]
|
|
||||||
|
|
||||||
# Add all custom mappings
|
|
||||||
for m in list(sum(custom_maps, ())):
|
|
||||||
if m[0] not in all_words:
|
|
||||||
all_words.append(m[0])
|
|
||||||
if m[1] not in all_words:
|
|
||||||
all_words.append(m[1])
|
|
||||||
|
|
||||||
|
|
||||||
print("Step 5")
|
|
||||||
|
|
||||||
# Lemmatize all words (plural -> singular)
|
|
||||||
# lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
|
|
||||||
|
|
||||||
with open("01-lemmatized-words.csv", "w") as f:
|
|
||||||
f.write("word,lemmatized_word\n")
|
|
||||||
|
|
||||||
iter = tqdm(all_words[:1000])
|
|
||||||
|
|
||||||
for w in iter:
|
|
||||||
lemmatized_word = nlp(w)[0].lemma_.upper()
|
|
||||||
if lemmatized_word == w:
|
|
||||||
continue
|
continue
|
||||||
if lemmatized_word not in all_words:
|
|
||||||
iter.write(f"{lemmatized_word} not in all_words")
|
|
||||||
|
|
||||||
f.write(f"{w},{lemmatized_word}\n")
|
if (word, lemmatized_word) in seen_lemmatizations:
|
||||||
|
continue
|
||||||
|
|
||||||
|
seen_lemmatizations.add((word, lemmatized_word))
|
||||||
|
|
||||||
|
if lemmatized_word not in words:
|
||||||
|
iter.write(f"{lemmatized_word} ({lemmatizer}) not in all_words")
|
||||||
|
erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
|
||||||
|
continue
|
||||||
|
|
||||||
|
iter.write(f"{word} => {lemmatized_word} ({lemmatizer}) added")
|
||||||
|
|
||||||
|
outfile.write(f"{word},{lemmatized_word},{lemmatizer}\n".encode("ascii"))
|
||||||
|
1285
wordlist/01-lemmatized-words.txt
Normal file
1285
wordlist/01-lemmatized-words.txt
Normal file
File diff suppressed because it is too large
Load Diff
103
wordlist/02-custom-lemmatizations.csv
Normal file
103
wordlist/02-custom-lemmatizations.csv
Normal file
@ -0,0 +1,103 @@
|
|||||||
|
WORD,LEMMATIZED_WORD
ADD,ADDS
|
||||||
|
ADS,ADDS
|
||||||
|
AFFECTED,EFFECT
|
||||||
|
AFFECT,EFFECT
|
||||||
|
AFFECTIONS,AFFECTION
|
||||||
|
AFFECTIVE,EFFECT
|
||||||
|
AFFECTS,EFFECT
|
||||||
|
ALUMINIUM,ALUMINUM
|
||||||
|
ALUMINUM,ALUMINIUM
|
||||||
|
ANALYSE,ANALYZE
|
||||||
|
ANALYSED,ANALYZE
|
||||||
|
ANALYSES,ANALYZE
|
||||||
|
AUX,OX
|
||||||
|
BE,BEE
|
||||||
|
BERRY,BARRY
|
||||||
|
BLEW,BLUE
|
||||||
|
BOT,BOUGHT
|
||||||
|
BOULDER,BOLDER
|
||||||
|
BRINGS,BRING
|
||||||
|
BY,BYE
|
||||||
|
CAPITOL,CAPITAL
|
||||||
|
CENTS,SENSE
|
||||||
|
CHILE,CHILI
|
||||||
|
CHILE,CHILLY
|
||||||
|
COLOURLESS,COLORLESS
|
||||||
|
COM,CALM
|
||||||
|
CORP,CORE
|
||||||
|
CORPS,CORE
|
||||||
|
CUE,QUEUE
|
||||||
|
DAZE,DAY
|
||||||
|
DAZED,DAY
|
||||||
|
DEAR,DEER
|
||||||
|
DESSERT,DESERT
|
||||||
|
DEW,DO
|
||||||
|
DEW,DUE
|
||||||
|
DIED,DYED
|
||||||
|
EFFECTIVE,EFFECT
|
||||||
|
EFFECTS,EFFECT
|
||||||
|
ELECTRONICS,ELECTRONIC
|
||||||
|
FAVOUR,FAVOR
|
||||||
|
FAX,FACTS
|
||||||
|
FILING,FILLING
|
||||||
|
FILINGS,FILLING
|
||||||
|
FORTUNATELY,FORTUNATE
|
||||||
|
FOUR,FOR
|
||||||
|
GRATE,GREAT
|
||||||
|
HAIRY,HARRY
|
||||||
|
HARRY,HAIRY
|
||||||
|
HEIR,HAIR
|
||||||
|
HEIRS,HAIR
|
||||||
|
HEM,HIM
|
||||||
|
HONOUR,HONOR
|
||||||
|
HONOURS,HONORS
|
||||||
|
HYMN,HIM
|
||||||
|
HYMNS,HIM
|
||||||
|
IMPROVES,IMPROVE
|
||||||
|
ISLE,AISLE
|
||||||
|
KNIGHT,NIGHT
|
||||||
|
KNOT,NOT
|
||||||
|
KNOTS,NOT
|
||||||
|
LARVAE,LARVA
|
||||||
|
LECTURER,LECTURE
|
||||||
|
MANOR,MANNER
|
||||||
|
MONIES,MONEYS
|
||||||
|
NEIGHBOURHOOD,NEIGHBORHOOD
|
||||||
|
NEIGHBOUR,NEIGHBOR
|
||||||
|
NEIGHBOURS,NEIGHBOR
|
||||||
|
NOSE,KNOW
|
||||||
|
NUN,NONE
|
||||||
|
ORE,OAR
|
||||||
|
ORE,OR
|
||||||
|
ORGANISATIONAL,ORGANIZATIONAL
|
||||||
|
ORGANISATION,ORGANIZATION
|
||||||
|
ORGANISATIONS,ORGANIZATION
|
||||||
|
OWE,OH
|
||||||
|
PAR,PARSE
|
||||||
|
PARS,PARSE
|
||||||
|
PEOPLES,PEOPLE
|
||||||
|
PER,PURR
|
||||||
|
PETAL,PEDAL
|
||||||
|
PROVIDES,PROVIDE
|
||||||
|
RAP,WRAP
|
||||||
|
REFORMED,REFORM
|
||||||
|
SCENT,CENT
|
||||||
|
SCENTS,CENT
|
||||||
|
SENSE,CENT
|
||||||
|
SENSED,CENT
|
||||||
|
SENSES,CENT
|
||||||
|
SIMULTANEOUSLY,SIMULTANEOUS
|
||||||
|
TELECOMMUNICATIONS,TELECOMMUNICATION
|
||||||
|
THEATRES,THEATER
|
||||||
|
THEATRE,THEATER
|
||||||
|
THRU,THROUGH
|
||||||
|
VAPOUR,VAPOR
|
||||||
|
VARY,VERY
|
||||||
|
VERTEBRA,VERTEBRAE
|
||||||
|
WEARY,WARY
|
||||||
|
WEIGHS,WAY
|
||||||
|
WEIGH,WAY
|
||||||
|
YELLOW,HELLO
|
||||||
|
CACHE,CASH
|
||||||
|
BYTE,BITE
|
||||||
|
COUNSELLOR,COUNSELOR
|
|
File diff suppressed because it is too large
Load Diff
8851
wordlist/04-deduplicated-words.csv
Normal file
8851
wordlist/04-deduplicated-words.csv
Normal file
File diff suppressed because it is too large
Load Diff
126
wordlist/04-deduplicated-words.py
Executable file
126
wordlist/04-deduplicated-words.py
Executable file
@ -0,0 +1,126 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
# coding: utf-8
|
||||||
|
|
||||||
|
import gzip
|
||||||
|
from pprint import pprint
|
||||||
|
from tqdm import tqdm
|
||||||
|
|
||||||
|
# 2**13 + 2 since two can be skipped
|
||||||
|
WORDLIST_SIZE=8192+2
|
||||||
|
|
||||||
|
print("Loading full wordlist")
|
||||||
|
|
||||||
|
# Read the frequency-ordered word list; only the first CSV column (the word
# itself) is kept.
with gzip.open("./00-frequency-list.csv.gz", 'r') as infile:
    # The first row is the CSV header, not a word.
    next(infile, None)
    all_words = [raw.decode('ascii').split(",")[0] for raw in infile]
|
||||||
|
|
||||||
|
print("Building lemmatization graph")
|
||||||
|
|
||||||
|
# Each entry is a set of words that are considered interchangeable
# (connected through one or more lemmatizations). Groups are kept disjoint.
lemmatization_graph = list()


def add_lemmatization(word1, word2):
    """Record that ``word1`` and ``word2`` belong to the same group.

    If neither word is known yet, a new group is created. If one or both
    words already belong to existing groups, every such group is merged into
    a single one, so a word is never a member of two different groups.

    A warning is printed when both words were already in the same group,
    since that means the lemmatization was redundant.
    """
    touching = [
        group for group in lemmatization_graph
        if word1 in group or word2 in group
    ]

    if not touching:
        # No known lemmatization involves either word. This is the common case.
        lemmatization_graph.append({word1, word2})
        return

    target = touching[0]
    if word1 in target and word2 in target:
        print(f"Warning: lemmatization {word1}<=>{word2} already in set: {target}")

    target.add(word1)
    target.add(word2)

    # word1 and word2 may have lived in different groups until now; fold the
    # other group(s) into the target so the graph stays a partition.
    absorbed = touching[1:]
    if absorbed:
        for other in absorbed:
            target.update(other)
        # Compare by identity: only the exact absorbed set objects are dropped.
        lemmatization_graph[:] = [
            group for group in lemmatization_graph
            if not any(group is other for other in absorbed)
        ]


def get_lemmatization(word):
    """Return the group (a set) containing ``word``, or None if there is none."""
    for lemmatization in lemmatization_graph:
        if word in lemmatization:
            return lemmatization
    return None
|
||||||
|
|
||||||
|
print("\tAdding automatic lemmatizations")
# Automated lemmatizations come first (gzip-compressed, read as bytes).
with gzip.open("./01-lemmatized-words.csv.gz") as infile:
    next(infile, None)  # the first row is skipped as a header
    for raw in infile:
        fields = raw.decode('ascii').strip().split(",")
        add_lemmatization(fields[0], fields[1])

print("\tAdding custom lemmatizations")
# Hand-maintained lemmatizations are layered on top (plain text file).
with open("./02-custom-lemmatizations.csv") as infile:
    next(infile, None)  # the first row is skipped as a header
    for row in infile:
        fields = row.strip().split(",")
        add_lemmatization(fields[0], fields[1])
|
||||||
|
|
||||||
|
print("Lemmatization graph constructed:")
|
||||||
|
pprint(lemmatization_graph)
|
||||||
|
|
||||||
|
print("Loading exclude wordlist")
|
||||||
|
# Words listed here are dropped from the final word list.
with open("./03-exclude.csv") as infile:
    next(infile, None)  # the first row is skipped as a header
    exclude_words = {row.strip() for row in infile}
|
||||||
|
|
||||||
|
# Walk the frequency-ordered words and collect the first WORDLIST_SIZE
# distinct lemmatization groups.
seen_word_lemmatizations = set()
final_wordlist = []
# Number of words consumed from all_words; stays 0 if all_words is empty.
ending_word_index = 0
for ending_word_index, word in enumerate(all_words, start=1):
    # A word without a known lemmatization forms a group of its own.
    word_lemmatizations = get_lemmatization(word) or {word}

    # Drop the whole group as soon as any member is on the exclude list.
    if not word_lemmatizations.isdisjoint(exclude_words):
        print(f"Note: {word_lemmatizations} is excluded")
        continue

    # Sets are unhashable, so remember each group by a frozenset snapshot.
    # Without recording it here, a group would be appended once for every
    # one of its member words that appears in all_words.
    group_key = frozenset(word_lemmatizations)
    if group_key in seen_word_lemmatizations:
        # We already added this one
        continue
    seen_word_lemmatizations.add(group_key)

    final_wordlist.append(word_lemmatizations)

    if len(final_wordlist) >= WORDLIST_SIZE:
        # We've added all the words we need
        break
|
||||||
|
|
||||||
|
# The frequency list must have supplied enough distinct groups.
assert len(final_wordlist) == WORDLIST_SIZE
pprint(list(enumerate(final_wordlist)))
print(f"Ending index: {ending_word_index}")

# Flatten the groups into (number, word) rows; numbers start at 1 and every
# word of a group shares that group's number.
numbered_rows = []
for idx, words in enumerate(final_wordlist):
    for word in words:
        numbered_rows.append((idx + 1, word))
final_wordlist = numbered_rows

with open("./04-deduplicated-words.csv", 'w') as outfile:
    outfile.write("WORD,NUMBER\n")
    for (idx, word) in final_wordlist:
        outfile.write(f"{word},{idx}\n")
|
||||||
|
# all_words.append(line.decode('ascii').split(",")[0])
|
Binary file not shown.
@ -6,6 +6,7 @@ asttokens==2.2.1
|
|||||||
attrs==22.2.0
|
attrs==22.2.0
|
||||||
backcall==0.2.0
|
backcall==0.2.0
|
||||||
beautifulsoup4==4.11.2
|
beautifulsoup4==4.11.2
|
||||||
|
black==23.1.0
|
||||||
bleach==6.0.0
|
bleach==6.0.0
|
||||||
blis==0.7.9
|
blis==0.7.9
|
||||||
catalogue==2.0.8
|
catalogue==2.0.8
|
||||||
@ -33,6 +34,7 @@ ipywidgets==8.0.4
|
|||||||
isoduration==20.11.0
|
isoduration==20.11.0
|
||||||
jedi==0.18.2
|
jedi==0.18.2
|
||||||
Jinja2==3.1.2
|
Jinja2==3.1.2
|
||||||
|
joblib==1.2.0
|
||||||
jsonpointer==2.3
|
jsonpointer==2.3
|
||||||
jsonschema==4.17.3
|
jsonschema==4.17.3
|
||||||
jupyter==1.0.0
|
jupyter==1.0.0
|
||||||
@ -49,11 +51,13 @@ MarkupSafe==2.1.2
|
|||||||
matplotlib-inline==0.1.6
|
matplotlib-inline==0.1.6
|
||||||
mistune==2.0.5
|
mistune==2.0.5
|
||||||
murmurhash==1.0.9
|
murmurhash==1.0.9
|
||||||
|
mypy-extensions==1.0.0
|
||||||
nbclassic==0.5.2
|
nbclassic==0.5.2
|
||||||
nbclient==0.7.2
|
nbclient==0.7.2
|
||||||
nbconvert==7.2.9
|
nbconvert==7.2.9
|
||||||
nbformat==5.7.3
|
nbformat==5.7.3
|
||||||
nest-asyncio==1.5.6
|
nest-asyncio==1.5.6
|
||||||
|
nltk==3.8.1
|
||||||
notebook==6.5.2
|
notebook==6.5.2
|
||||||
notebook_shim==0.2.2
|
notebook_shim==0.2.2
|
||||||
numpy==1.24.2
|
numpy==1.24.2
|
||||||
@ -66,6 +70,7 @@ packaging==23.0
|
|||||||
pandas==1.5.3
|
pandas==1.5.3
|
||||||
pandocfilters==1.5.0
|
pandocfilters==1.5.0
|
||||||
parso==0.8.3
|
parso==0.8.3
|
||||||
|
pathspec==0.11.0
|
||||||
pathy==0.10.1
|
pathy==0.10.1
|
||||||
pexpect==4.8.0
|
pexpect==4.8.0
|
||||||
pickleshare==0.7.5
|
pickleshare==0.7.5
|
||||||
|
File diff suppressed because it is too large
Load Diff
@ -1,220 +0,0 @@
|
|||||||
{
|
|
||||||
"cells": [
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 1,
|
|
||||||
"id": "991a711f-be98-4aae-a657-84b065449916",
|
|
||||||
"metadata": {
|
|
||||||
"tags": []
|
|
||||||
},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"try:\n",
|
|
||||||
" _initialized\n",
|
|
||||||
"except:\n",
|
|
||||||
" # !pip install spacy\n",
|
|
||||||
" # !python -m spacy download en_core_web_trf\n",
|
|
||||||
" import spacy\n",
|
|
||||||
" \n",
|
|
||||||
" nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
|
|
||||||
" \n",
|
|
||||||
" _initialized=True\n",
|
|
||||||
" \n",
|
|
||||||
"import pandas as pd\n",
|
|
||||||
"import gzip\n",
|
|
||||||
"import re"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 2,
|
|
||||||
"id": "d130bb84",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"def get_lines(filename):\n",
|
|
||||||
" with gzip.open(filename, 'r') as f:\n",
|
|
||||||
" ret = []\n",
|
|
||||||
" for l in f:\n",
|
|
||||||
" if len(ret) > 30_000:\n",
|
|
||||||
" return ret\n",
|
|
||||||
" ret.append(str(l).lower())\n",
|
|
||||||
" return ret\n",
|
|
||||||
"\n",
|
|
||||||
"\n",
|
|
||||||
" \n",
|
|
||||||
"WORDLIST_SIZE = 8192 + 3\n",
|
|
||||||
"word_re = re.compile(r\"^[A-Za-z]+$\")"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 3,
|
|
||||||
"id": "de2d1731",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"!pwd\n",
|
|
||||||
"!ls"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 4,
|
|
||||||
"id": "90665714",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"annotated_words=pd.read_excel(\"annotated_words.ods\")\n",
|
|
||||||
"\n",
|
|
||||||
"excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
|
|
||||||
"excluded_words[0:10]\n",
|
|
||||||
"\n",
|
|
||||||
"custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
|
|
||||||
"\n",
|
|
||||||
"custom_maps = [\n",
|
|
||||||
" (m[1][\"word\"].lower(), mapping.lower())\n",
|
|
||||||
" for m in custom_maps.iterrows()\n",
|
|
||||||
" for mapping in m[1][\"maps_to\"]\n",
|
|
||||||
"]\n",
|
|
||||||
"custom_maps"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 5,
|
|
||||||
"id": "fb50c69e",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Start parsing the wordlist\n",
|
|
||||||
"all_words = get_lines(\"00-frequency-all.txt.gz\")\n",
|
|
||||||
"\n",
|
|
||||||
"# Delete header line\n",
|
|
||||||
"all_words = all_words[1:]\n",
|
|
||||||
"\n",
|
|
||||||
"# Get only the word (fixed width)\n",
|
|
||||||
"all_words = [w[13:36].strip() for w in all_words]\n",
|
|
||||||
"\n",
|
|
||||||
"# Remove special characters\n",
|
|
||||||
"all_words = [w for w in all_words if word_re.search(w)]\n",
|
|
||||||
"\n",
|
|
||||||
"# Remove all removed words\n",
|
|
||||||
"all_words = [w for w in all_words if w not in excluded_words]\n",
|
|
||||||
"\n",
|
|
||||||
"# Add all custom mappings\n",
|
|
||||||
"for m in list(sum(custom_maps, ())):\n",
|
|
||||||
" if m[0] not in all_words:\n",
|
|
||||||
" all_words.append(m[0])\n",
|
|
||||||
" if m[1] not in all_words:\n",
|
|
||||||
" all_words.append(m[1])"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 13,
|
|
||||||
"id": "cd21bff5",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"# Lemmatize all words (plural -> singular)\n",
|
|
||||||
"lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in all_words[:100]]\n",
|
|
||||||
"print(lemmatize_mappings[:100])\n",
|
|
||||||
"\n",
|
|
||||||
"# Add custom lemmatizations\n",
|
|
||||||
"for l in custom_maps:\n",
|
|
||||||
" if l in lemmatize_mappings:\n",
|
|
||||||
" print(f\"Warning: {l} is already lemmatized\")\n",
|
|
||||||
" else:\n",
|
|
||||||
" lemmatize_mappings.append(l)\n",
|
|
||||||
" \n",
|
|
||||||
"print(lemmatize_mappings[:100])\n",
|
|
||||||
"\n",
|
|
||||||
"lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n",
|
|
||||||
"print(lemmatize_mappings[:100])\n",
|
|
||||||
"\n",
|
|
||||||
"# Now, re-add all lematized words to the list of every word\n",
|
|
||||||
"for w in sum(lemmatize_mappings, ()):\n",
|
|
||||||
" if w not in all_words:\n",
|
|
||||||
" print(w)\n",
|
|
||||||
" all_words.append(w)\n",
|
|
||||||
" \n",
|
|
||||||
"lemmatize_mappings = {k: v for k, v in lemmatize_mappings}"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 7,
|
|
||||||
"id": "0ee9af7d",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"final_wordlist = []\n",
|
|
||||||
"seen_lemmatizations = set()\n",
|
|
||||||
"for w in all_words:\n",
|
|
||||||
" lemmatized = lemmatize_mappings.get(w) or w\n",
|
|
||||||
" if lemmatized in seen_lemmatizations:\n",
|
|
||||||
" # The lemmatized version of this word was already seen\n",
|
|
||||||
" continue\n",
|
|
||||||
" else:\n",
|
|
||||||
" # The lemmatized version hasn't been seen. We're good to add it\n",
|
|
||||||
" final_wordlist.append([\n",
|
|
||||||
" k\n",
|
|
||||||
" for k\n",
|
|
||||||
" in lemmatize_mappings.keys()\n",
|
|
||||||
" if lemmatize_mappings[k] == lemmatized\n",
|
|
||||||
" ])\n",
|
|
||||||
" seen_lemmatizations.add(lemmatized)\n",
|
|
||||||
"\n",
|
|
||||||
" if len(final_wordlist) >= WORDLIST_SIZE:\n",
|
|
||||||
" break\n",
|
|
||||||
"\n",
|
|
||||||
"# Now, convert it to the format (number, word)\n",
|
|
||||||
"final_wordlist = [\n",
|
|
||||||
" (idx, w)\n",
|
|
||||||
" for idx, words in enumerate(final_wordlist)\n",
|
|
||||||
" for w in words\n",
|
|
||||||
"]"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": 12,
|
|
||||||
"id": "07c1293c",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": [
|
|
||||||
"print(len(lemmatize_mappings))"
|
|
||||||
]
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"cell_type": "code",
|
|
||||||
"execution_count": null,
|
|
||||||
"id": "19c255d0",
|
|
||||||
"metadata": {},
|
|
||||||
"outputs": [],
|
|
||||||
"source": []
|
|
||||||
}
|
|
||||||
],
|
|
||||||
"metadata": {
|
|
||||||
"kernelspec": {
|
|
||||||
"display_name": "Python 3 (ipykernel)",
|
|
||||||
"language": "python",
|
|
||||||
"name": "python3"
|
|
||||||
},
|
|
||||||
"language_info": {
|
|
||||||
"codemirror_mode": {
|
|
||||||
"name": "ipython",
|
|
||||||
"version": 3
|
|
||||||
},
|
|
||||||
"file_extension": ".py",
|
|
||||||
"mimetype": "text/x-python",
|
|
||||||
"name": "python",
|
|
||||||
"nbconvert_exporter": "python",
|
|
||||||
"pygments_lexer": "ipython3",
|
|
||||||
"version": "3.11.2"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"nbformat": 4,
|
|
||||||
"nbformat_minor": 5
|
|
||||||
}
|
|
@ -1,159 +0,0 @@
|
|||||||
#!/usr/bin/env python3
|
|
||||||
# coding: utf-8
|
|
||||||
|
|
||||||
print("Step 1")

# Notebook-style guard: the model load below only runs while `_initialized`
# is still undefined in this namespace.  Looking up an unbound name raises
# NameError, so that is the only exception caught here; anything else
# propagates instead of being swallowed by a bare `except`.
try:
    _initialized
except NameError:
    # Setup, if missing:
    #   pip install spacy
    #   python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm

    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

    _initialized = True
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import gzip
|
|
||||||
import re
|
|
||||||
|
|
||||||
|
|
||||||
print("Step 2")


def get_lines(filename, max_lines=30_001):
    """Return the leading lines of a gzip file as lower-cased strings.

    Each line is read as ``bytes`` and converted with ``str()``, so every
    entry is the lower-cased *repr* of the raw line (``b'...'`` wrapper and
    escaped newline included), not decoded text.
    NOTE(review): the fixed-column slice applied to these strings later in
    the script presumably relies on that repr layout -- confirm before
    switching to ``.decode()``.

    Args:
        filename: Path of the gzip-compressed file to read.
        max_lines: Maximum number of lines to return.  The default of
            30_001 reproduces the previously hard-coded cutoff; pass
            ``None`` to read the whole file.

    Returns:
        A list with at most ``max_lines`` strings, in file order.
    """
    lines = []
    with gzip.open(filename, 'r') as f:
        for raw in f:
            if max_lines is not None and len(lines) >= max_lines:
                break
            lines.append(str(raw).lower())
    return lines


# Number of word groups the generated list should contain.
# NOTE(review): the reason for the extra 3 on top of 8192 is not visible here.
WORDLIST_SIZE = 8192 + 3
# Accepts strings made up of one or more ASCII letters only.
word_re = re.compile(r"^[A-Za-z]+$")
|
|
||||||
|
|
||||||
|
|
||||||
print("Step 3")

annotated_words = pd.read_excel("annotated_words.ods")

# Every row whose "keep" column is anything other than "Yes" is excluded;
# the excluded words are stored lower-cased.
rejected_rows = annotated_words[annotated_words["keep"] != "Yes"]
excluded_words = list(rejected_rows["word"].str.lower())

# Rows with a "maps_to" value define manual mappings.  The cell holds a
# comma-separated list and each entry yields one (word, target) pair, both
# lower-cased.
mapped_rows = annotated_words[annotated_words["maps_to"].notna()]
custom_maps = []
for _, row in mapped_rows[["word", "maps_to"]].iterrows():
    for target in row["maps_to"].split(","):
        custom_maps.append((row["word"].lower(), target.lower()))
|
|
||||||
|
|
||||||
|
|
||||||
print("Step 4")

# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Keep letter-only entries that are not excluded.  The exclusion list is
# turned into a set once so each lookup is constant time.
excluded_lookup = set(excluded_words)
all_words = [
    w for w in all_words
    if word_re.search(w) and w not in excluded_lookup
]

# Make sure both sides of every custom mapping are present in the list.
# custom_maps holds (word, mapping) pairs, so each pair is walked directly.
# Flattening the pairs first and then indexing [0]/[1] picked out single
# characters of each string instead of the two words.
known_words = set(all_words)
for pair in custom_maps:
    for entry in pair:
        if entry not in known_words:
            known_words.add(entry)
            all_words.append(entry)
|
|
||||||
|
|
||||||
|
|
||||||
print("Step 5")

# Lemmatize all words (plural -> singular): one (word, lemma) pair per word,
# using the lemma of the first token the pipeline produces.
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
print(lemmatize_mappings[:100])

# Add custom lemmatizations, warning about pairs that are already present.
# A set mirrors the list so each membership test is constant time.
known_pairs = set(lemmatize_mappings)
for mapping in custom_maps:
    if mapping in known_pairs:
        print(f"Warning: {mapping} is already lemmatized")
    else:
        known_pairs.add(mapping)
        lemmatize_mappings.append(mapping)

print(lemmatize_mappings[:100])

# Drop every pair whose lemma is an excluded word.
excluded_lookup = set(excluded_words)
lemmatize_mappings = [
    pair for pair in lemmatize_mappings if pair[1] not in excluded_lookup
]
print(lemmatize_mappings[:100])

# Now, re-add all lemmatized words to the list of every word.  The pairs are
# walked in place rather than concatenated into one large tuple first.
known_words = set(all_words)
for pair in lemmatize_mappings:
    for w in pair:
        if w not in known_words:
            print(w)
            known_words.add(w)
            all_words.append(w)

# Final lookup table word -> lemma; a later pair for the same word wins.
lemmatize_mappings = dict(lemmatize_mappings)
|
|
||||||
|
|
||||||
|
|
||||||
print("Step 6")

# Group the source words by the lemma they map to in a single pass, keeping
# the dict's iteration order.  This replaces a full scan of the mapping for
# every new lemma and produces the same groups.
words_by_lemma = {}
for source, lemma in lemmatize_mappings.items():
    words_by_lemma.setdefault(lemma, []).append(source)

final_wordlist = []
seen_lemmatizations = set()
for w in all_words:
    # Words without a (truthy) mapping stand in as their own lemma.
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue

    # The lemmatized version hasn't been seen. We're good to add it.
    # NOTE(review): when no word maps to `lemmatized` the group is empty yet
    # still takes up a slot (and a number) -- confirm this is intended.
    final_wordlist.append(list(words_by_lemma.get(lemmatized, ())))
    seen_lemmatizations.add(lemmatized)

    if len(final_wordlist) >= WORDLIST_SIZE:
        break

# Now, convert it to the format (number, word); numbers start at 0 and all
# words of a group share that group's number.
final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
]
|
|
||||||
|
|
||||||
|
|
||||||
print("Step 7")

print(len(lemmatize_mappings))

print("Step 8")

# Write a header row, then one "WORD,number" row per (number, word) entry
# with the word upper-cased.  The unused `lemmatized` local was dropped: it
# was computed for every row but never read.
with open("01-generated-wordlist.csv", "w") as f:
    f.write("word,number\n")

    for number, word in final_wordlist:
        f.write(f"{word.upper()},{number}\n")

print("Done")
|
|
Loading…
Reference in New Issue
Block a user