Use new wordlist generation logic
This commit is contained in:
parent
3ac59f35ed
commit
71404b8c6e
BIN
wordlist/00-frequency-list.csv.gz
Normal file
BIN
wordlist/00-frequency-list.csv.gz
Normal file
Binary file not shown.
501
wordlist/01-errored-lemmatized-words.csv
Normal file
501
wordlist/01-errored-lemmatized-words.csv
Normal file
@ -0,0 +1,501 @@
|
||||
WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER
|
||||
THEMSELVES,THEMSELVE,SPACY
|
||||
PERHAPS,PERHAP,SPACY
|
||||
SERIES,SERIE,SPACY
|
||||
OURSELVES,OURSELVE,SPACY
|
||||
EXCEED,EXCEE,SPACY
|
||||
BLEED,BLEE,SPACY
|
||||
MATHEMATICS,MATHEMATIC,SPACY
|
||||
NAKED,NAKE,SPACY
|
||||
SKILLED,SKILLE,SPACY
|
||||
BELOVED,BELOVE,SPACY
|
||||
LEST,L,SPACY
|
||||
WICKED,WICKE,SPACY
|
||||
EMBED,EMBE,SPACY
|
||||
DIABETES,DIABETE,SPACY
|
||||
ONGOING,ONGOE,SPACY
|
||||
ASHAMED,ASHAME,SPACY
|
||||
CREED,CREE,SPACY
|
||||
SINNER,SINN,SPACY
|
||||
INDEBTED,INDEBTE,SPACY
|
||||
UNCHANGED,UNCHANGE,SPACY
|
||||
UNPUBLISHED,UNPUBLISHE,SPACY
|
||||
UNEMPLOYED,UNEMPLOYE,SPACY
|
||||
FORTHCOMING,FORTHCOME,SPACY
|
||||
METAPHYSICS,METAPHYSIC,SPACY
|
||||
TROUSERS,TROUSER,SPACY
|
||||
UNAFFECTED,UNAFFECTE,SPACY
|
||||
RENOWNED,RENOWNE,SPACY
|
||||
TALENTED,TALENTE,SPACY
|
||||
GREED,GREE,SPACY
|
||||
UNFINISHED,UNFINISHE,SPACY
|
||||
AESTHETICS,AESTHETIC,SPACY
|
||||
INFRARED,INFRARE,SPACY
|
||||
DISINTERESTED,DISINTERESTE,SPACY
|
||||
UNNOTICED,UNNOTICE,SPACY
|
||||
TING,TE,SPACY
|
||||
ANNALS,ANNAL,SPACY
|
||||
OUTSKIRTS,OUTSKIRT,SPACY
|
||||
DETEST,DET,SPACY
|
||||
FETTER,FETT,SPACY
|
||||
SIDEWAYS,SIDEWAY,SPACY
|
||||
ALMS,ALM,SPACY
|
||||
MEASLES,MEASLE,SPACY
|
||||
UNRESTRICTED,UNRESTRICTE,SPACY
|
||||
ARREARS,ARREAR,SPACY
|
||||
UNDEVELOPED,UNDEVELOPE,SPACY
|
||||
CARIES,CARIE,SPACY
|
||||
MORES,MORE,SPACY
|
||||
UNALTERED,UNALTERE,SPACY
|
||||
UNPROTECTED,UNPROTECTE,SPACY
|
||||
UNEDUCATED,UNEDUCATE,SPACY
|
||||
GALLOWS,GALLOW,SPACY
|
||||
UNDATED,UNDATE,SPACY
|
||||
UNNAMED,UNNAME,SPACY
|
||||
MONIES,MONIE,SPACY
|
||||
UNAIDED,UNAIDE,SPACY
|
||||
UNQUESTIONED,UNQUESTIONE,SPACY
|
||||
IMBED,IMBE,SPACY
|
||||
AMINES,AMINE,SPACY
|
||||
GRASSROOTS,GRASSROOT,SPACY
|
||||
ACCURSED,ACCURSE,SPACY
|
||||
UNDECIDED,UNDECIDE,SPACY
|
||||
UNCHECKED,UNCHECKE,SPACY
|
||||
UNCOMPLICATED,UNCOMPLICATE,SPACY
|
||||
BELATED,BELATE,SPACY
|
||||
UNDETERMINED,UNDETERMINE,SPACY
|
||||
EAVES,EAVE,SPACY
|
||||
DISAFFECTED,DISAFFECT,SPACY
|
||||
UNFULFILLED,UNFULFILLE,SPACY
|
||||
STRIATED,STRIATE,SPACY
|
||||
RICKETS,RICKET,SPACY
|
||||
BICEPS,BICEP,SPACY
|
||||
DEATHBED,DEATHBE,SPACY
|
||||
RABIES,RABIE,SPACY
|
||||
UNATTENDED,UNATTENDE,SPACY
|
||||
UNABATED,UNABATE,SPACY
|
||||
MANNERED,MANNERE,SPACY
|
||||
FAECES,FAECE,SPACY
|
||||
QUADRUPED,QUADRUPE,SPACY
|
||||
UNSTRUCTURED,UNSTRUCTURE,SPACY
|
||||
UNAVAILING,UNAVAILE,SPACY
|
||||
SUBSPECIES,SUBSPECIE,SPACY
|
||||
UNDETECTED,UNDETECTE,SPACY
|
||||
UNPLANNED,UNPLANNE,SPACY
|
||||
UNCONDITIONED,UNCONDITIONE,SPACY
|
||||
INBRED,INBREED,SPACY
|
||||
TONGS,TONG,SPACY
|
||||
BIGOTED,BIGOTE,SPACY
|
||||
ENTRAILS,ENTRAIL,SPACY
|
||||
UNEQUALLED,UNEQUALLE,SPACY
|
||||
ONCOMING,ONCOME,SPACY
|
||||
UNTHINKING,UNTHINKE,SPACY
|
||||
MENSES,MENSE,SPACY
|
||||
UNMITIGATED,UNMITIGATE,SPACY
|
||||
UNRECORDED,UNRECORDE,SPACY
|
||||
SEMIOTICS,SEMIOTIC,SPACY
|
||||
UNACKNOWLEDGED,UNACKNOWLEDGE,SPACY
|
||||
UNDISGUISED,UNDISGUISE,SPACY
|
||||
PYRITES,PYRITE,SPACY
|
||||
UNOBSTRUCTED,UNOBSTRUCTE,SPACY
|
||||
UNATTACHED,UNATTACHE,SPACY
|
||||
UNMATCHED,UNMATCHE,SPACY
|
||||
PLIERS,PLIER,SPACY
|
||||
ENAMORED,ENAMOR,SPACY
|
||||
PANTIES,PANTIE,SPACY
|
||||
BACKWOODS,BACKWOOD,SPACY
|
||||
UNPROVOKED,UNPROVOKE,SPACY
|
||||
TRICEPS,TRICEP,SPACY
|
||||
UNCHARTED,UNCHARTE,SPACY
|
||||
MALNOURISHED,MALNOURISH,SPACY
|
||||
MONEYED,MONEYE,SPACY
|
||||
UNTAMED,UNTAME,SPACY
|
||||
METHYLATED,METHYLATE,SPACY
|
||||
UNRELIEVED,UNRELIEVE,SPACY
|
||||
UNLETTERED,UNLETTERE,SPACY
|
||||
HOTBED,HOTBE,SPACY
|
||||
UNIMPROVED,UNIMPROVE,SPACY
|
||||
LOPSIDED,LOPSIDE,SPACY
|
||||
LEGGING,LEGGE,SPACY
|
||||
UNFEIGNED,UNFEIGNE,SPACY
|
||||
UNTESTED,UNTESTE,SPACY
|
||||
UNBLEMISHED,UNBLEMISHE,SPACY
|
||||
TECHNICS,TECHNIC,SPACY
|
||||
UNTAINTED,UNTAINTE,SPACY
|
||||
UNREGISTERED,UNREGISTERE,SPACY
|
||||
UNFORMED,UNFORME,SPACY
|
||||
OVERWEENING,OVERWEENE,SPACY
|
||||
UNPROVIDED,UNPROVIDE,SPACY
|
||||
FOOTLIGHTS,FOOTLIGHT,SPACY
|
||||
UNCONVERTED,UNCONVERTE,SPACY
|
||||
OBSEQUIES,OBSEQUIE,SPACY
|
||||
PINCERS,PINCER,SPACY
|
||||
MALADJUSTED,MALADJUSTE,SPACY
|
||||
ISOSCELES,ISOSCELE,SPACY
|
||||
UNPROVED,UNPROVE,SPACY
|
||||
AMIDSHIPS,AMIDSHIP,SPACY
|
||||
SEMISKILLED,SEMISKILLE,SPACY
|
||||
UNDIRECTED,UNDIRECTE,SPACY
|
||||
TABES,TABE,SPACY
|
||||
UNCLAIMED,UNCLAIME,SPACY
|
||||
UNPOLISHED,UNPOLISHE,SPACY
|
||||
FARSIGHTED,FARSIGHTE,SPACY
|
||||
FAUCES,FAUCE,SPACY
|
||||
NONCOMMISSIONED,NONCOMMISSIONE,SPACY
|
||||
UNCHARGED,UNCHARGE,SPACY
|
||||
CONGERIES,CONGERIE,SPACY
|
||||
SCABIES,SCABIE,SPACY
|
||||
MALFORMED,MALFORME,SPACY
|
||||
INFORMATICS,INFORMATIC,SPACY
|
||||
INCREMENTED,INCREMENTE,SPACY
|
||||
UNDISTRIBUTED,UNDISTRIBUTE,SPACY
|
||||
HYDRODYNAMICS,HYDRODYNAMIC,SPACY
|
||||
ANTIPODES,ANTIPODE,SPACY
|
||||
UNDEREMPLOYED,UNDEREMPLOYE,SPACY
|
||||
BIPED,BIPE,SPACY
|
||||
ELECTRODYNAMICS,ELECTRODYNAMIC,SPACY
|
||||
FEEBLEMINDED,FEEBLEMINDE,SPACY
|
||||
SUDS,SUD,SPACY
|
||||
UNDERSIZED,UNDERSIZE,SPACY
|
||||
HUSTINGS,HUSTING,SPACY
|
||||
STOREYED,STOREYE,SPACY
|
||||
UNREFINED,UNREFINE,SPACY
|
||||
UNTURNED,UNTURNE,SPACY
|
||||
SERRIED,SERRIE,SPACY
|
||||
DOLDRUMS,DOLDRUM,SPACY
|
||||
STATS,STAT,SPACY
|
||||
GULES,GULE,SPACY
|
||||
UNDERPANTS,UNDERPANT,SPACY
|
||||
UNREWARDING,UNREWARDE,SPACY
|
||||
CALIPERS,CALIPER,SPACY
|
||||
ONESIDED,ONESIDE,SPACY
|
||||
UNABRIDGED,UNABRIDGE,SPACY
|
||||
UNFURNISHED,UNFURNISHE,SPACY
|
||||
UNREDEEMED,UNREDEEME,SPACY
|
||||
UNSEEING,UNSEEE,SPACY
|
||||
KNICKERS,KNICKER,SPACY
|
||||
UNLISTED,UNLISTE,SPACY
|
||||
INNARDS,INNARD,SPACY
|
||||
GLANDERS,GLANDER,SPACY
|
||||
OVEREXTENDED,OVEREXTEND,SPACY
|
||||
RELATIVIZED,RELATIVIZE,SPACY
|
||||
UNFUNDED,UNFUNDE,SPACY
|
||||
COLUMNED,COLUMNE,SPACY
|
||||
CALISTHENICS,CALISTHENIC,SPACY
|
||||
SUPERFICIES,SUPERFICIE,SPACY
|
||||
CASTELLATED,CASTELLATE,SPACY
|
||||
PUREBRED,PUREBRE,SPACY
|
||||
CORTES,CORTE,SPACY
|
||||
REDHEADED,REDHEADE,SPACY
|
||||
CASTANETS,CASTANET,SPACY
|
||||
UNPRACTISED,UNPRACTISE,SPACY
|
||||
UNEDITED,UNEDITE,SPACY
|
||||
LIVERIED,LIVERIE,SPACY
|
||||
NOSEBLEED,NOSEBLEE,SPACY
|
||||
UNDERFUNDED,UNDERFUNDE,SPACY
|
||||
UNGRADED,UNGRADE,SPACY
|
||||
UNREDUCED,UNREDUCE,SPACY
|
||||
SIDEBURNS,SIDEBURN,SPACY
|
||||
SICKBED,SICKBE,SPACY
|
||||
FASCES,FASCE,SPACY
|
||||
AVIONICS,AVIONIC,SPACY
|
||||
CRENELATED,CRENELATE,SPACY
|
||||
NATES,NATE,SPACY
|
||||
UNREGARDED,UNREGARDE,SPACY
|
||||
UNRECONSTRUCTED,UNRECONSTRUCTE,SPACY
|
||||
BRITCHES,BRITCHE,SPACY
|
||||
DEDANS,DEDAN,SPACY
|
||||
PARATROOPS,PARATROOP,SPACY
|
||||
FINEGRAINED,FINEGRAINE,SPACY
|
||||
GREAVES,GREAVE,SPACY
|
||||
SUBSCRIPTED,SUBSCRIPTE,SPACY
|
||||
LUES,LUE,SPACY
|
||||
BIOMETRICS,BIOMETRIC,SPACY
|
||||
ARIES,ARIE,SPACY
|
||||
GASWORKS,GASWORK,SPACY
|
||||
BULLETED,BULLETE,SPACY
|
||||
HEARTSTRINGS,HEARTSTRING,SPACY
|
||||
INCREMENTING,INCREMENTE,SPACY
|
||||
UNCLEARED,UNCLEARE,SPACY
|
||||
CONSOLS,CONSOL,SPACY
|
||||
MUDFLATS,MUDFLAT,SPACY
|
||||
BADLANDS,BADLAND,SPACY
|
||||
TALIPES,TALIPE,SPACY
|
||||
LANCINATING,LANCINATE,SPACY
|
||||
UNACCOMPLISHED,UNACCOMPLISHE,SPACY
|
||||
TESSELLATED,TESSELLATE,SPACY
|
||||
SEMIFINISHED,SEMIFINISHE,SPACY
|
||||
UNAVENGED,UNAVENGE,SPACY
|
||||
SOAPSUDS,SOAPSUD,SPACY
|
||||
YEATS,YEAT,SPACY
|
||||
TELEMATICS,TELEMATIC,SPACY
|
||||
UNNOTED,UNNOTE,SPACY
|
||||
MEALIES,MEALIE,SPACY
|
||||
PARIES,PARIE,SPACY
|
||||
AUROCHS,AUROCH,SPACY
|
||||
AGOING,AGOE,SPACY
|
||||
ODDMENTS,ODDMENT,SPACY
|
||||
CLEARHEADED,CLEARHEADE,SPACY
|
||||
REDOUBTED,REDOUBTE,SPACY
|
||||
IVIED,IVIE,SPACY
|
||||
PINNIPED,PINNIPE,SPACY
|
||||
DEFATTED,DEFATTE,SPACY
|
||||
DECAFFEINATED,DECAFFEINATE,SPACY
|
||||
NINEPINS,NINEPIN,SPACY
|
||||
CAMPHORATED,CAMPHORATE,SPACY
|
||||
GLASSWORKS,GLASSWORK,SPACY
|
||||
SORITES,SORITE,SPACY
|
||||
AFFOREST,AFFOR,SPACY
|
||||
DISSAVING,DISSAVE,SPACY
|
||||
UNADVISED,UNADVISE,SPACY
|
||||
UNRECLAIMED,UNRECLAIME,SPACY
|
||||
LARES,LARE,SPACY
|
||||
LEVELHEADED,LEVELHEADE,SPACY
|
||||
SWEATPANTS,SWEATPANT,SPACY
|
||||
LOTOS,LOTO,SPACY
|
||||
GIBLETS,GIBLET,SPACY
|
||||
UNBOWED,UNBOWE,SPACY
|
||||
UNPROMPTED,UNPROMPTE,SPACY
|
||||
ABSCESSED,ABSCESSE,SPACY
|
||||
NODULATED,NODULATE,SPACY
|
||||
RUBENS,RUBEN,SPACY
|
||||
UNPAGED,UNPAGE,SPACY
|
||||
CALENDS,CALEND,SPACY
|
||||
TRUNKED,TRUNKE,SPACY
|
||||
TROUSERED,TROUSERE,SPACY
|
||||
PENATES,PENATE,SPACY
|
||||
COMBINATORICS,COMBINATORIC,SPACY
|
||||
TRESSED,TRESSE,SPACY
|
||||
PARTICOLOURED,PARTICOLOURE,SPACY
|
||||
UNENCRYPTED,UNENCRYPTE,SPACY
|
||||
ASTRONAUTICS,ASTRONAUTIC,SPACY
|
||||
HYDROPONICS,HYDROPONIC,SPACY
|
||||
UNFORMATTED,UNFORMATTE,SPACY
|
||||
SEMIDETACHED,SEMIDETACHE,SPACY
|
||||
BONKERS,BONKER,SPACY
|
||||
UNDIES,UNDIE,SPACY
|
||||
EPS,EP,SPACY
|
||||
GIMBALS,GIMBAL,SPACY
|
||||
BALCONIED,BALCONIE,SPACY
|
||||
SALTWORKS,SALTWORK,SPACY
|
||||
UNPLEDGED,UNPLEDGE,SPACY
|
||||
PREDESIGNED,PREDESIGNE,SPACY
|
||||
NFS,NF,SPACY
|
||||
UNDERBRED,UNDERBRE,SPACY
|
||||
PRECOMPILED,PRECOMPILE,SPACY
|
||||
KALENDS,KALEND,SPACY
|
||||
LITOTES,LITOTE,SPACY
|
||||
INDIGESTED,INDIGESTE,SPACY
|
||||
CITS,CIT,SPACY
|
||||
UNPRICED,UNPRICE,SPACY
|
||||
PINCHERS,PINCHER,SPACY
|
||||
CANCELLATED,CANCELLATE,SPACY
|
||||
CHITTERLINGS,CHITTERLING,SPACY
|
||||
DIBS,DIB,SPACY
|
||||
RIGHTWARDS,RIGHTWARD,SPACY
|
||||
CONVENANCES,CONVENANCE,SPACY
|
||||
INTERALLIED,INTERALLIE,SPACY
|
||||
FLINDERS,FLINDER,SPACY
|
||||
CRANNIED,CRANNIE,SPACY
|
||||
HOMEBRED,HOMEBRE,SPACY
|
||||
HIGHBRED,HIGHBRE,SPACY
|
||||
UNRULED,UNRULE,SPACY
|
||||
FOREHANDED,FOREHANDE,SPACY
|
||||
PREPACKED,PREPACKE,SPACY
|
||||
UNWISHED,UNWISHE,SPACY
|
||||
ENTREMETS,ENTREMET,SPACY
|
||||
ESTOVERS,ESTOVER,SPACY
|
||||
ANGELES,ANGELE,SPACY
|
||||
DAISIED,DAISIE,SPACY
|
||||
UPRATED,UPRATE,SPACY
|
||||
THIGHED,THIGHE,SPACY
|
||||
TURPS,TURP,SPACY
|
||||
WEAZENED,WEAZENE,SPACY
|
||||
EFFING,EFF,SPACY
|
||||
HOLS,HOL,SPACY
|
||||
JIGGERED,JIGGERE,SPACY
|
||||
SOCRATES,SOCRATE,SPACY
|
||||
AUDITORIES,AUDITORIE,SPACY
|
||||
AMBAGES,AMBAGE,SPACY
|
||||
DOITED,DOITE,SPACY
|
||||
BIONICS,BIONIC,SPACY
|
||||
UNREFERENCED,UNREFERENCE,SPACY
|
||||
EXEQUIES,EXEQUIE,SPACY
|
||||
CERASTES,CERASTE,SPACY
|
||||
SEMIMANUFACTURES,SEMIMANUFACTURE,SPACY
|
||||
GALLUSES,GALLUS,SPACY
|
||||
RERECORDED,RERECORDE,SPACY
|
||||
TELESALES,TELESALE,SPACY
|
||||
MICROGRAPHICS,MICROGRAPHIC,SPACY
|
||||
SIEMENS,SIEMEN,SPACY
|
||||
ZOUNDS,ZOUND,SPACY
|
||||
SEMIFIXED,SEMIFIXE,SPACY
|
||||
UNDIVERTED,UNDIVERTE,SPACY
|
||||
SANIES,SANIE,SPACY
|
||||
BREECHING,BREECHE,SPACY
|
||||
MENTHOLATED,MENTHOLATE,SPACY
|
||||
PANTALETS,PANTALET,SPACY
|
||||
CRUDITES,CRUDITE,SPACY
|
||||
TRAPES,TRAPE,SPACY
|
||||
PIXILATED,PIXILATE,SPACY
|
||||
BOOTES,BOOTE,SPACY
|
||||
UNPOSTED,UNPOSTE,SPACY
|
||||
HANTS,HANT,SPACY
|
||||
UNDETAILED,UNDETAILE,SPACY
|
||||
HAVINGS,HAVING,SPACY
|
||||
OUTGIVING,OUTGIVE,SPACY
|
||||
UNCOMPLEMENTED,UNCOMPLEMENTE,SPACY
|
||||
PRATIES,PRATIE,SPACY
|
||||
ELEVENSES,ELEVENSE,SPACY
|
||||
UNENLIVENED,UNENLIVENE,SPACY
|
||||
NANTES,NANTE,SPACY
|
||||
AFFINED,AFFINE,SPACY
|
||||
NONNESTED,NONNESTE,SPACY
|
||||
FALLOWING,FALLOWE,SPACY
|
||||
HYDROMECHANICS,HYDROMECHANIC,SPACY
|
||||
CLIVERS,CLIVER,SPACY
|
||||
UNICES,UNICE,SPACY
|
||||
GRAMMATICS,GRAMMATIC,SPACY
|
||||
PRAPS,PRAP,SPACY
|
||||
INTERWORKING,INTERWORKE,SPACY
|
||||
HERCULES,HERCULE,SPACY
|
||||
BIGHEADED,BIGHEADE,SPACY
|
||||
KIES,KY,SPACY
|
||||
NETHERLANDS,NETHERLAND,SPACY
|
||||
UNBOOKED,UNBOOKE,SPACY
|
||||
QUINS,QUIN,SPACY
|
||||
CANNES,CANNE,SPACY
|
||||
UNNURTURED,UNNURTURE,SPACY
|
||||
WEDGIES,WEDGIE,SPACY
|
||||
HANDWORKED,HANDWORKE,SPACY
|
||||
ANALECTS,ANALECT,SPACY
|
||||
HERTS,HERT,SPACY
|
||||
ORLEANS,ORLEAN,SPACY
|
||||
PESCADORES,PESCADORE,SPACY
|
||||
ULCERED,ULCERE,SPACY
|
||||
MISCREATED,MISCREATE,SPACY
|
||||
UNPRIZED,UNPRIZE,SPACY
|
||||
SLYBOOTS,SLYBOOT,SPACY
|
||||
RUNTED,RUNTE,SPACY
|
||||
REATTRIBUTED,REATTRIBUTE,SPACY
|
||||
HOUSETRAINED,HOUSETRAINE,SPACY
|
||||
SOBERSIDES,SOBERSIDE,SPACY
|
||||
COLESEED,COLESEE,SPACY
|
||||
BLUCHERS,BLUCHER,SPACY
|
||||
MUGGINS,MUGGIN,SPACY
|
||||
UNCRIPPLED,UNCRIPPLE,SPACY
|
||||
HEPPED,HEPPE,SPACY
|
||||
WITHINDOORS,WITHINDOOR,SPACY
|
||||
BEESTINGS,BEESTING,SPACY
|
||||
FLANDERS,FLANDER,SPACY
|
||||
DIOGENES,DIOGENE,SPACY
|
||||
COSMONAUTICS,COSMONAUTIC,SPACY
|
||||
WHOLEGRAINS,WHOLEGRAIN,SPACY
|
||||
NEEDMENTS,NEEDMENT,SPACY
|
||||
ACHATES,ACHATE,SPACY
|
||||
PRECOMPILING,PRECOMPILE,SPACY
|
||||
BALUSTERED,BALUSTERE,SPACY
|
||||
JUGGINS,JUGGIN,SPACY
|
||||
UNCONFIGURED,UNCONFIGURE,SPACY
|
||||
SLUGABED,SLUGABE,SPACY
|
||||
CHARGRILLED,CHARGRILLE,SPACY
|
||||
GANGES,GANGE,SPACY
|
||||
FLATWAYS,FLATWAY,SPACY
|
||||
CHAMPERS,CHAMPER,SPACY
|
||||
GOLDILOCKS,GOLDILOCK,SPACY
|
||||
REIMS,REIM,SPACY
|
||||
REIMPORTING,REIMPORTE,SPACY
|
||||
EMOTIONED,EMOTIONE,SPACY
|
||||
AIRBED,AIRBE,SPACY
|
||||
GIGAFLOPS,GIGAFLOP,SPACY
|
||||
YONKS,YONK,SPACY
|
||||
CASALS,CASAL,SPACY
|
||||
ROCKIES,ROCKIE,SPACY
|
||||
ORESTES,ORESTE,SPACY
|
||||
REMAPPING,REMAPPE,SPACY
|
||||
EBONICS,EBONIC,SPACY
|
||||
BRUGES,BRUGE,SPACY
|
||||
JANKERS,JANKER,SPACY
|
||||
NOTTS,NOTT,SPACY
|
||||
PROCRUSTES,PROCRUSTE,SPACY
|
||||
MULTISCALED,MULTISCALE,SPACY
|
||||
AGROTECHNICS,AGROTECHNIC,SPACY
|
||||
WAYGOING,WAYGOE,SPACY
|
||||
GENDERING,GENDERE,SPACY
|
||||
TELEMECHANICS,TELEMECHANIC,SPACY
|
||||
DEGATING,DEGATE,SPACY
|
||||
THAMES,THAME,SPACY
|
||||
LOWLIVED,LOWLIVE,SPACY
|
||||
REEDING,REEDE,SPACY
|
||||
INTERCROSSING,INTERCROSSE,SPACY
|
||||
UNDEDUCTED,UNDEDUCTE,SPACY
|
||||
AGOGICS,AGOGIC,SPACY
|
||||
UNATTENDING,UNATTENDE,SPACY
|
||||
OVERMASTED,OVERMASTE,SPACY
|
||||
GILES,GILE,SPACY
|
||||
NONCOPYRIGHTED,NONCOPYRIGHTE,SPACY
|
||||
LUDDITES,LUDDITE,SPACY
|
||||
SCURVIED,SCURVIE,SPACY
|
||||
REBREAKING,REBREAKE,SPACY
|
||||
KEATS,KEAT,SPACY
|
||||
CERVANTES,CERVANTE,SPACY
|
||||
UNCONDONED,UNCONDONE,SPACY
|
||||
DESCARTES,DESCARTE,SPACY
|
||||
BEJABERS,BEJABER,SPACY
|
||||
VIDEOGRAPHICS,VIDEOGRAPHIC,SPACY
|
||||
EURIPIDES,EURIPIDE,SPACY
|
||||
UNPERJURED,UNPERJURE,SPACY
|
||||
LAERTES,LAERTE,SPACY
|
||||
OVERCOLLECTED,OVERCOLLECTE,SPACY
|
||||
AMPHIBRACHYS,AMPHIBRACHY,SPACY
|
||||
CHEOPS,CHEOP,SPACY
|
||||
CHALONS,CHALON,SPACY
|
||||
VERSICOLOURED,VERSICOLOURE,SPACY
|
||||
SUBPARTITIONED,SUBPARTITIONE,SPACY
|
||||
BALBUTIES,BALBUTIE,SPACY
|
||||
ARCHIMEDES,ARCHIMEDE,SPACY
|
||||
GATELEGGED,GATELEGGE,SPACY
|
||||
POITIERS,POITIER,SPACY
|
||||
HAVERING,HAVERE,SPACY
|
||||
THEBES,THEBE,SPACY
|
||||
SEVRES,SEVRE,SPACY
|
||||
PERICLES,PERICLE,SPACY
|
||||
LIMOGES,LIMOGE,SPACY
|
||||
EVENTING,EVENTE,SPACY
|
||||
FATBITS,FATBIT,SPACY
|
||||
HUTTING,HUTTE,SPACY
|
||||
DOGSHORES,DOGSHORE,SPACY
|
||||
OVERBADING,OVERBADE,SPACY
|
||||
AZORES,AZORE,SPACY
|
||||
BLEWITS,BLEWIT,SPACY
|
||||
HIPOCRATES,HIPOCRATE,SPACY
|
||||
AMIENS,AMIEN,SPACY
|
||||
GUTTING,GUTTE,SPACY
|
||||
GLADYS,GLADY,SPACY
|
||||
CHADDED,CHADDE,SPACY
|
||||
EUPHRATES,EUPHRATE,SPACY
|
||||
TROWING,TROWE,SPACY
|
||||
LACEUPS,LACEUP,SPACY
|
||||
ALIPED,ALIPE,SPACY
|
||||
TALIPED,TALIPE,SPACY
|
||||
RAMSES,RAMSE,SPACY
|
||||
CENTRONICS,CENTRONIC,SPACY
|
||||
BANTING,BANTE,SPACY
|
||||
TELEPHOTOLENS,TELEPHOTOLEN,SPACY
|
||||
ARAKS,ARAK,SPACY
|
||||
DONETS,DONET,SPACY
|
||||
CEROPLASTICS,CEROPLASTIC,SPACY
|
||||
BAYNETWORKS,BAYNETWORK,SPACY
|
||||
NORWARDS,NORWARD,SPACY
|
||||
HAPPING,HAPPE,SPACY
|
||||
BARENTS,BARENT,SPACY
|
||||
ABLINGS,ABLING,SPACY
|
||||
CELLING,CELLE,SPACY
|
||||
CELEBES,CELEBE,SPACY
|
||||
NENETS,NENET,SPACY
|
||||
IMPING,IMPE,SPACY
|
||||
LINARES,LINARE,SPACY
|
||||
VAILING,VAILE,SPACY
|
||||
HABDABS,HABDAB,SPACY
|
||||
RELISTING,RELISTE,SPACY
|
||||
HOUGHING,HOUGHE,SPACY
|
|
@ -1,122 +0,0 @@
|
||||
word,lemmatized_word
|
||||
the,THE
|
||||
of,OF
|
||||
to,TO
|
||||
in,IN
|
||||
is,BE
|
||||
that,THAT
|
||||
for,FOR
|
||||
be,BE
|
||||
by,BY
|
||||
with,WITH
|
||||
on,ON
|
||||
not,NOT
|
||||
this,THIS
|
||||
are,BE
|
||||
at,AT
|
||||
from,FROM
|
||||
he,HE
|
||||
which,WHICH
|
||||
his,HIS
|
||||
have,HAVE
|
||||
an,AN
|
||||
but,BUT
|
||||
you,YOU
|
||||
they,THEY
|
||||
were,BE
|
||||
had,HAVE
|
||||
we,WE
|
||||
all,ALL
|
||||
one,ONE
|
||||
their,THEIR
|
||||
been,BE
|
||||
will,WILL
|
||||
there,THERE
|
||||
can,CAN
|
||||
if,IF
|
||||
other,OTHER
|
||||
would,WOULD
|
||||
no,NO
|
||||
her,SHE
|
||||
may,MAY
|
||||
more,MORE
|
||||
when,WHEN
|
||||
who,WHO
|
||||
such,SUCH
|
||||
these,THESE
|
||||
any,ANY
|
||||
she,SHE
|
||||
new,NEW
|
||||
time,TIME
|
||||
than,THAN
|
||||
do,DO
|
||||
some,SOME
|
||||
what,WHAT
|
||||
only,ONLY
|
||||
into,INTO
|
||||
them,THEY
|
||||
two,TWO
|
||||
also,ALSO
|
||||
about,ABOUT
|
||||
out,OUT
|
||||
him,HE
|
||||
my,MY
|
||||
said,SAY
|
||||
up,UP
|
||||
our,OUR
|
||||
first,FIRST
|
||||
should,SHOULD
|
||||
under,UNDER
|
||||
made,MAKE
|
||||
state,STATE
|
||||
see,SEE
|
||||
after,AFTER
|
||||
could,COULD
|
||||
then,THEN
|
||||
me,I
|
||||
most,MOST
|
||||
over,OVER
|
||||
very,VERY
|
||||
your,YOUR
|
||||
between,BETWEEN
|
||||
where,WHERE
|
||||
now,NOW
|
||||
shall,SHALL
|
||||
work,WORK
|
||||
those,THOSE
|
||||
same,SAME
|
||||
well,WELL
|
||||
each,EACH
|
||||
many,MANY
|
||||
being,BE
|
||||
years,YEAR
|
||||
did,DO
|
||||
year,YEAR
|
||||
through,THROUGH
|
||||
must,MUST
|
||||
upon,UPON
|
||||
before,BEFORE
|
||||
like,LIKE
|
||||
use,USE
|
||||
part,PART
|
||||
general,GENERAL
|
||||
people,PEOPLE
|
||||
because,BECAUSE
|
||||
used,USE
|
||||
how,HOW
|
||||
even,EVEN
|
||||
much,MUCH
|
||||
states,STATE
|
||||
during,DURING
|
||||
both,BOTH
|
||||
case,CASE
|
||||
three,THREE
|
||||
number,NUMBER
|
||||
make,MAKE
|
||||
per,PER
|
||||
great,GREAT
|
||||
act,ACT
|
||||
way,WAY
|
||||
life,LIFE
|
||||
good,GOOD
|
||||
day,DAY
|
|
BIN
wordlist/01-lemmatized-words.csv.gz
Normal file
BIN
wordlist/01-lemmatized-words.csv.gz
Normal file
Binary file not shown.
@ -1,103 +1,70 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding: utf-8
|
||||
|
||||
print("Step 1")
|
||||
print("Loading dependencies")
|
||||
|
||||
|
||||
try:
|
||||
_initialized
|
||||
except:
|
||||
# !pip install spacy
|
||||
# !python -m spacy download en_core_web_trf
|
||||
import spacy
|
||||
from tqdm import tqdm
|
||||
|
||||
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
|
||||
|
||||
_initialized=True
|
||||
|
||||
import pandas as pd
|
||||
import spacy
|
||||
import nltk
|
||||
from tqdm import tqdm
|
||||
import gzip
|
||||
import re
|
||||
|
||||
# Wordnet
|
||||
try:
|
||||
from nltk.stem.wordnet import WordNetLemmatizer
|
||||
except:
|
||||
nltk.download("wordnet")
|
||||
from nltk.stem.wordnet import WordNetLemmatizer
|
||||
wordnet = WordNetLemmatizer()
|
||||
|
||||
print("Step 2")
|
||||
# Spacy
|
||||
nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])
|
||||
|
||||
print("Loading initial wordlist")
|
||||
|
||||
def get_lines(filename):
    """Read the leading lines of a gzip-compressed file.

    The file is opened in binary mode, so each line is a ``bytes`` object.
    Every line is passed through ``str()`` and lower-cased, which means the
    returned entries are the lower-cased *repr* of the bytes (``"b'...'"``),
    not decoded text.  Reading stops once 30,001 entries have been collected.

    Args:
        filename: Path of the gzip file to read.

    Returns:
        A list of lower-cased string representations of the raw lines.
    """
    collected = []
    with gzip.open(filename, 'r') as handle:
        for raw_line in handle:
            # The cap is checked before appending, so the list can grow to
            # 30,001 entries (one more than the literal below).
            if len(collected) > 30_000:
                break
            collected.append(str(raw_line).lower())
    return collected
|
||||
words = []
|
||||
with gzip.open("./00-frequency-list.csv.gz", 'r') as infile:
|
||||
for line in infile:
|
||||
words.append(line.decode('ascii').split(",")[0])
|
||||
|
||||
# Remove header
|
||||
words = words[1:]
|
||||
|
||||
print(words[0:5])
|
||||
|
||||
WORDLIST_SIZE = 8192 + 3
|
||||
word_re = re.compile(r"^[A-Za-z]+$")
|
||||
print("Lemmatizing words")
|
||||
|
||||
seen_lemmatizations = set()
|
||||
|
||||
print("Step 3")
|
||||
with open("./01-errored-lemmatized-words.csv", 'w') as erroutfile:
|
||||
erroutfile.write("WORD,ATTEMPTED_LEMMATIZATION,LEMMATIZER\n")
|
||||
|
||||
with gzip.open("./01-lemmatized-words.csv.gz", 'w') as outfile:
|
||||
outfile.write("WORD,LEMMATIZED_WORD,LEMMATIZER\n".encode("ascii"))
|
||||
|
||||
annotated_words=pd.read_excel("annotated_words.ods")
|
||||
iter = tqdm(words)
|
||||
|
||||
excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())
|
||||
excluded_words[0:10]
|
||||
for word in iter:
|
||||
lemmatized_words = [
|
||||
# Wordnet
|
||||
(wordnet.lemmatize(word).upper(), 'WORDNET'),
|
||||
# Spacy
|
||||
(nlp(word)[0].lemma_.upper().upper(), 'SPACY'),
|
||||
]
|
||||
|
||||
custom_maps = annotated_words[annotated_words["maps_to"].notna()][["word","maps_to"]].assign(maps_to=lambda x: x["maps_to"].map(lambda y: y.split(",")))
|
||||
|
||||
custom_maps = [
|
||||
(m[1]["word"].lower(), mapping.lower())
|
||||
for m in custom_maps.iterrows()
|
||||
for mapping in m[1]["maps_to"]
|
||||
]
|
||||
custom_maps
|
||||
|
||||
|
||||
print("Step 4")
|
||||
|
||||
|
||||
# Start parsing the wordlist
|
||||
all_words = get_lines("00-frequency-all.txt.gz")
|
||||
|
||||
# Delete header line
|
||||
all_words = all_words[1:]
|
||||
|
||||
# Get only the word (fixed width)
|
||||
all_words = [w[13:36].strip() for w in all_words]
|
||||
|
||||
# Remove special characters
|
||||
all_words = [w for w in all_words if word_re.search(w)]
|
||||
|
||||
# Remove all removed words
|
||||
all_words = [w for w in all_words if w not in excluded_words]
|
||||
|
||||
# Add all custom mappings
|
||||
for m in list(sum(custom_maps, ())):
|
||||
if m[0] not in all_words:
|
||||
all_words.append(m[0])
|
||||
if m[1] not in all_words:
|
||||
all_words.append(m[1])
|
||||
|
||||
|
||||
print("Step 5")
|
||||
|
||||
# Lemmatize all words (plural -> singular)
|
||||
# lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
|
||||
|
||||
with open("01-lemmatized-words.csv", "w") as f:
|
||||
f.write("word,lemmatized_word\n")
|
||||
|
||||
iter = tqdm(all_words[:1000])
|
||||
|
||||
for w in iter:
|
||||
lemmatized_word = nlp(w)[0].lemma_.upper()
|
||||
if lemmatized_word == w:
|
||||
for (lemmatized_word, lemmatizer) in lemmatized_words:
|
||||
if word == lemmatized_word:
|
||||
continue
|
||||
if lemmatized_word not in all_words:
|
||||
iter.write(f"{lemmatized_word} not in all_words")
|
||||
|
||||
f.write(f"{w},{lemmatized_word}\n")
|
||||
if (word, lemmatized_word) in seen_lemmatizations:
|
||||
continue
|
||||
|
||||
seen_lemmatizations.add((word, lemmatized_word))
|
||||
|
||||
if lemmatized_word not in words:
|
||||
iter.write(f"{lemmatized_word} ({lemmatizer}) not in all_words")
|
||||
erroutfile.write(f"{word},{lemmatized_word},{lemmatizer}\n")
|
||||
continue
|
||||
|
||||
iter.write(f"{word} => {lemmatized_word} ({lemmatizer}) added")
|
||||
|
||||
outfile.write(f"{word},{lemmatized_word},{lemmatizer}\n".encode("ascii"))
|
||||
|
1285
wordlist/01-lemmatized-words.txt
Normal file
1285
wordlist/01-lemmatized-words.txt
Normal file
File diff suppressed because it is too large
Load Diff
103
wordlist/02-custom-lemmatizations.csv
Normal file
103
wordlist/02-custom-lemmatizations.csv
Normal file
@ -0,0 +1,103 @@
|
||||
ADD,ADDS
|
||||
ADS,ADDS
|
||||
AFFECTED,EFFECT
|
||||
AFFECT,EFFECT
|
||||
AFFECTIONS,AFFECTION
|
||||
AFFECTIVE,EFFECT
|
||||
AFFECTS,EFFECT
|
||||
ALUMINIUM,ALUMINUM
|
||||
ALUMINUM,ALUMINIUM
|
||||
ANALYSE,ANALYZE
|
||||
ANALYSED,ANALYZE
|
||||
ANALYSES,ANALYZE
|
||||
AUX,OX
|
||||
BE,BEE
|
||||
BERRY,BARRY
|
||||
BLEW,BLUE
|
||||
BOT,BOUGHT
|
||||
BOULDER,BOLDER
|
||||
BRINGS,BRING
|
||||
BY,BYE
|
||||
CAPITOL,CAPITAL
|
||||
CENTS,SENSE
|
||||
CHILE,CHILI
|
||||
CHILE,CHILLY
|
||||
COLOURLESS,COLORLESS
|
||||
COM,CALM
|
||||
CORP,CORE
|
||||
CORPS,CORE
|
||||
CUE,QUEUE
|
||||
DAZE,DAY
|
||||
DAZED,DAY
|
||||
DEAR,DEER
|
||||
DESSERT,DESERT
|
||||
DEW,DO
|
||||
DEW,DUE
|
||||
DIED,DYED
|
||||
EFFECTIVE,EFFECT
|
||||
EFFECTS,EFFECT
|
||||
ELECTRONICS,ELECTRONIC
|
||||
FAVOUR,FAVOR
|
||||
FAX,FACTS
|
||||
FILING,FILLING
|
||||
FILINGS,FILLING
|
||||
FORTUNATELY,FORTUNATE
|
||||
FOUR,FOR
|
||||
GRATE,GREAT
|
||||
HAIRY,HARRY
|
||||
HARRY,HAIRY
|
||||
HEIR,HAIR
|
||||
HEIRS,HAIR
|
||||
HEM,HIM
|
||||
HONOUR,HONOR
|
||||
HONOURS,HONORS
|
||||
HYMN,HIM
|
||||
HYMNS,HIM
|
||||
IMPROVES,IMPROVE
|
||||
ISLE,AISLE
|
||||
KNIGHT,NIGHT
|
||||
KNOT,NOT
|
||||
KNOTS,NOT
|
||||
LARVAE,LARVA
|
||||
LECTURER,LECTURE
|
||||
MANOR,MANNER
|
||||
MONIES,MONEYS
|
||||
NEIGHBOURHOOD,NEIGHBORHOOD
|
||||
NEIGHBOUR,NEIGHBOR
|
||||
NEIGHBOURS,NEIGHBOR
|
||||
NOSE,KNOW
|
||||
NUN,NONE
|
||||
ORE,OAR
|
||||
ORE,OR
|
||||
ORGANISATIONAL,ORGANIZATIONAL
|
||||
ORGANISATION,ORGANIZATION
|
||||
ORGANISATIONS,ORGANIZATION
|
||||
OWE,OH
|
||||
PAR,PARSE
|
||||
PARS,PARSE
|
||||
PEOPLES,PEOPLE
|
||||
PER,PURR
|
||||
PETAL,PEDAL
|
||||
PROVIDES,PROVIDE
|
||||
RAP,WRAP
|
||||
REFORMED,REFORM
|
||||
SCENT,CENT
|
||||
SCENTS,CENT
|
||||
SENSE,CENT
|
||||
SENSED,CENT
|
||||
SENSES,CENT
|
||||
SIMULTANEOUSLY,SIMULTANEOUS
|
||||
TELECOMMUNICATIONS,TELECOMMUNICATION
|
||||
THEATRES,THEATER
|
||||
THEATRE,THEATER
|
||||
THRU,THROUGH
|
||||
VAPOUR,VAPOR
|
||||
VARY,VERY
|
||||
VERTEBRA,VERTEBRAE
|
||||
WEARY,WARY
|
||||
WEIGHS,WAY
|
||||
WEIGH,WAY
|
||||
YELLOW,HELLO
|
||||
CACHE,CASH
|
||||
BYTE,BITE
|
||||
COUNSELLOR,COUNSELOR
|
|
File diff suppressed because it is too large
Load Diff
8851
wordlist/04-deduplicated-words.csv
Normal file
8851
wordlist/04-deduplicated-words.csv
Normal file
File diff suppressed because it is too large
Load Diff
126
wordlist/04-deduplicated-words.py
Executable file
126
wordlist/04-deduplicated-words.py
Executable file
@ -0,0 +1,126 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding: utf-8
|
||||
|
||||
import gzip
|
||||
from pprint import pprint
|
||||
from tqdm import tqdm
|
||||
|
||||
# 2**13 + 2 since two can be skipped
|
||||
WORDLIST_SIZE=8192+2
|
||||
|
||||
print("Loading full wordlist")

# Read the frequency list: drop the header row, then keep only the first
# comma-separated field (the word itself) of every remaining row.
with gzip.open("./00-frequency-list.csv.gz", 'r') as infile:
    next(infile, None)
    all_words = [row.decode('ascii').split(",")[0] for row in infile]
|
||||
|
||||
print("Building lemmatization graph")
|
||||
|
||||
# List of sets; each set is one group of words considered equivalent.
lemmatization_graph = list()


def add_lemmatization(word1, word2):
    """Record that ``word1`` and ``word2`` belong to the same group.

    If neither word is known yet, a new two-word group is appended to
    ``lemmatization_graph``.  If at least one word is already known, both
    words are added to the first group that contains either of them, and any
    *other* group containing either word is merged into it.  Merging keeps
    the groups disjoint, so a word can never end up in two groups.

    A warning is printed when the pair is already fully contained in the
    first matching group.

    Args:
        word1: First word of the pair.
        word2: Second word of the pair.
    """
    pair = {word1, word2}

    # Every existing group that shares at least one word with the pair.
    touching = [group for group in lemmatization_graph if not pair.isdisjoint(group)]

    if not touching:
        # There is no known lemmatization between these two. Add it.
        lemmatization_graph.append(pair)
        return

    target = touching[0]
    if word1 in target and word2 in target:
        print(f"Warning: lemmatization {word1}<=>{word2} already in set: {target}")

    target.update(pair)

    absorbed = touching[1:]
    if absorbed:
        for group in absorbed:
            target.update(group)
        # Drop the absorbed groups by identity, mutating the list in place so
        # that every existing reference to the graph stays valid.
        absorbed_ids = {id(group) for group in absorbed}
        lemmatization_graph[:] = [
            group for group in lemmatization_graph if id(group) not in absorbed_ids
        ]
|
||||
|
||||
def get_lemmatization(word):
    """Return the first group in ``lemmatization_graph`` containing ``word``.

    Args:
        word: The word to look up.

    Returns:
        The matching set of words, or ``None`` when no group contains it.
    """
    return next(
        (group for group in lemmatization_graph if word in group),
        None,
    )
|
||||
|
||||
print("\tAdding automatic lemmatizations")
# The automated lemmatizations are fed into the graph first.
with gzip.open("./01-lemmatized-words.csv.gz") as infile:
    for row_number, raw_row in enumerate(infile):
        if row_number == 0:
            # Header row
            continue
        fields = raw_row.decode('ascii').strip().split(",")
        # Column 0 is the word, column 1 its lemmatized form.
        add_lemmatization(fields[0], fields[1])
|
||||
|
||||
print("\tAdding custom lemmatizations")
# Next, iterate over manual lemmatizations
with open("./02-custom-lemmatizations.csv") as infile:
    for row_number, line in enumerate(infile):
        stripped = line.strip()
        if not stripped:
            # Tolerate blank lines (e.g. a trailing newline at end of file)
            continue

        fields = stripped.split(",")

        # The generated CSVs of this pipeline start with a "WORD,..." header,
        # but the hand-maintained custom file may not have one.  Only skip the
        # first row when it really is a header; unconditionally skipping it
        # would silently drop the first mapping of a header-less file.
        if row_number == 0 and fields[0] == "WORD":
            continue

        add_lemmatization(fields[0], fields[1])
|
||||
|
||||
print("Lemmatization graph constructed:")
pprint(lemmatization_graph)

print("Loading exclude wordlist")
# Every line after the header is one excluded word (surrounding whitespace
# removed); duplicates collapse because the result is a set.
with open("./03-exclude.csv") as infile:
    exclude_words = {
        row.strip()
        for row_number, row in enumerate(infile)
        if row_number > 0
    }
|
||||
|
||||
# Walk the frequency-ordered word list and collect the first WORDLIST_SIZE
# distinct word groups that do not touch the exclude list.
seen_word_lemmatizations = set()
final_wordlist = []
ending_word_index = 0
for word in all_words:
    ending_word_index += 1

    # A word without any known lemmatization forms a group of its own.
    word_lemmatizations = get_lemmatization(word) or {word}

    # Skip the whole group as soon as any member is excluded.
    if not word_lemmatizations.isdisjoint(exclude_words):
        print(f"Note: {word_lemmatizations} is excluded")
        continue

    # Sets are unhashable, so a frozen copy serves as the "seen" key.
    group_key = frozenset(word_lemmatizations)
    if group_key in seen_word_lemmatizations:
        # We already added this one
        continue
    # Remember the group; without this, every further member of the same
    # group would occupy an additional slot in the final list.
    seen_word_lemmatizations.add(group_key)

    final_wordlist.append(word_lemmatizations)

    if len(final_wordlist) >= WORDLIST_SIZE:
        # We've added all the words we need
        break

assert len(final_wordlist) == WORDLIST_SIZE
pprint(list(enumerate(final_wordlist)))
print(f"Ending index: {ending_word_index}")
|
||||
|
||||
# Flatten the groups into (number, word) pairs; numbering is 1-based and all
# members of one group share the same number.
numbered_words = []
for position, group in enumerate(final_wordlist, start=1):
    for member in group:
        numbered_words.append((position, member))
final_wordlist = numbered_words

with open("./04-deduplicated-words.csv", 'w') as outfile:
    outfile.write("WORD,NUMBER\n")
    outfile.writelines(
        f"{member},{position}\n" for position, member in final_wordlist
    )
|
||||
# all_words.append(line.decode('ascii').split(",")[0])
|
Binary file not shown.
@ -6,6 +6,7 @@ asttokens==2.2.1
|
||||
attrs==22.2.0
|
||||
backcall==0.2.0
|
||||
beautifulsoup4==4.11.2
|
||||
black==23.1.0
|
||||
bleach==6.0.0
|
||||
blis==0.7.9
|
||||
catalogue==2.0.8
|
||||
@ -33,6 +34,7 @@ ipywidgets==8.0.4
|
||||
isoduration==20.11.0
|
||||
jedi==0.18.2
|
||||
Jinja2==3.1.2
|
||||
joblib==1.2.0
|
||||
jsonpointer==2.3
|
||||
jsonschema==4.17.3
|
||||
jupyter==1.0.0
|
||||
@ -49,11 +51,13 @@ MarkupSafe==2.1.2
|
||||
matplotlib-inline==0.1.6
|
||||
mistune==2.0.5
|
||||
murmurhash==1.0.9
|
||||
mypy-extensions==1.0.0
|
||||
nbclassic==0.5.2
|
||||
nbclient==0.7.2
|
||||
nbconvert==7.2.9
|
||||
nbformat==5.7.3
|
||||
nest-asyncio==1.5.6
|
||||
nltk==3.8.1
|
||||
notebook==6.5.2
|
||||
notebook_shim==0.2.2
|
||||
numpy==1.24.2
|
||||
@ -66,6 +70,7 @@ packaging==23.0
|
||||
pandas==1.5.3
|
||||
pandocfilters==1.5.0
|
||||
parso==0.8.3
|
||||
pathspec==0.11.0
|
||||
pathy==0.10.1
|
||||
pexpect==4.8.0
|
||||
pickleshare==0.7.5
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,220 +0,0 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "991a711f-be98-4aae-a657-84b065449916",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"try:\n",
|
||||
" _initialized\n",
|
||||
"except:\n",
|
||||
" # !pip install spacy\n",
|
||||
" # !python -m spacy download en_core_web_trf\n",
|
||||
" import spacy\n",
|
||||
" \n",
|
||||
" nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])\n",
|
||||
" \n",
|
||||
" _initialized=True\n",
|
||||
" \n",
|
||||
"import pandas as pd\n",
|
||||
"import gzip\n",
|
||||
"import re"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "d130bb84",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"def get_lines(filename):\n",
|
||||
" with gzip.open(filename, 'r') as f:\n",
|
||||
" ret = []\n",
|
||||
" for l in f:\n",
|
||||
" if len(ret) > 30_000:\n",
|
||||
" return ret\n",
|
||||
" ret.append(str(l).lower())\n",
|
||||
" return ret\n",
|
||||
"\n",
|
||||
"\n",
|
||||
" \n",
|
||||
"WORDLIST_SIZE = 8192 + 3\n",
|
||||
"word_re = re.compile(r\"^[A-Za-z]+$\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "de2d1731",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!pwd\n",
|
||||
"!ls"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "90665714",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"annotated_words=pd.read_excel(\"annotated_words.ods\")\n",
|
||||
"\n",
|
||||
"excluded_words = list(annotated_words[annotated_words[\"keep\"] != \"Yes\"][\"word\"].str.lower())\n",
|
||||
"excluded_words[0:10]\n",
|
||||
"\n",
|
||||
"custom_maps = annotated_words[annotated_words[\"maps_to\"].notna()][[\"word\",\"maps_to\"]].assign(maps_to=lambda x: x[\"maps_to\"].map(lambda y: y.split(\",\")))\n",
|
||||
"\n",
|
||||
"custom_maps = [\n",
|
||||
" (m[1][\"word\"].lower(), mapping.lower())\n",
|
||||
" for m in custom_maps.iterrows()\n",
|
||||
" for mapping in m[1][\"maps_to\"]\n",
|
||||
"]\n",
|
||||
"custom_maps"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "fb50c69e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Start parsing the wordlist\n",
|
||||
"all_words = get_lines(\"00-frequency-all.txt.gz\")\n",
|
||||
"\n",
|
||||
"# Delete header line\n",
|
||||
"all_words = all_words[1:]\n",
|
||||
"\n",
|
||||
"# Get only the word (fixed width)\n",
|
||||
"all_words = [w[13:36].strip() for w in all_words]\n",
|
||||
"\n",
|
||||
"# Remove special characters\n",
|
||||
"all_words = [w for w in all_words if word_re.search(w)]\n",
|
||||
"\n",
|
||||
"# Remove all removed words\n",
|
||||
"all_words = [w for w in all_words if w not in excluded_words]\n",
|
||||
"\n",
|
||||
"# Add all custom mappings\n",
|
||||
"for m in list(sum(custom_maps, ())):\n",
|
||||
" if m[0] not in all_words:\n",
|
||||
" all_words.append(m[0])\n",
|
||||
" if m[1] not in all_words:\n",
|
||||
" all_words.append(m[1])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "cd21bff5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Lemmatize all words (plural -> singular)\n",
|
||||
"lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in all_words[:100]]\n",
|
||||
"print(lemmatize_mappings[:100])\n",
|
||||
"\n",
|
||||
"# Add custom lemmatizations\n",
|
||||
"for l in custom_maps:\n",
|
||||
" if l in lemmatize_mappings:\n",
|
||||
" print(f\"Warning: {l} is already lemmatized\")\n",
|
||||
" else:\n",
|
||||
" lemmatize_mappings.append(l)\n",
|
||||
" \n",
|
||||
"print(lemmatize_mappings[:100])\n",
|
||||
"\n",
|
||||
"lemmatize_mappings = [w for w in lemmatize_mappings if w[1] not in excluded_words]\n",
|
||||
"print(lemmatize_mappings[:100])\n",
|
||||
"\n",
|
||||
"# Now, re-add all lematized words to the list of every word\n",
|
||||
"for w in sum(lemmatize_mappings, ()):\n",
|
||||
" if w not in all_words:\n",
|
||||
" print(w)\n",
|
||||
" all_words.append(w)\n",
|
||||
" \n",
|
||||
"lemmatize_mappings = {k: v for k, v in lemmatize_mappings}"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "0ee9af7d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"final_wordlist = []\n",
|
||||
"seen_lemmatizations = set()\n",
|
||||
"for w in all_words:\n",
|
||||
" lemmatized = lemmatize_mappings.get(w) or w\n",
|
||||
" if lemmatized in seen_lemmatizations:\n",
|
||||
" # The lemmatized version of this word was already seen\n",
|
||||
" continue\n",
|
||||
" else:\n",
|
||||
" # The lemmatized version hasn't been seen. We're good to add it\n",
|
||||
" final_wordlist.append([\n",
|
||||
" k\n",
|
||||
" for k\n",
|
||||
" in lemmatize_mappings.keys()\n",
|
||||
" if lemmatize_mappings[k] == lemmatized\n",
|
||||
" ])\n",
|
||||
" seen_lemmatizations.add(lemmatized)\n",
|
||||
"\n",
|
||||
" if len(final_wordlist) >= WORDLIST_SIZE:\n",
|
||||
" break\n",
|
||||
"\n",
|
||||
"# Now, convert it to the format (number, word)\n",
|
||||
"final_wordlist = [\n",
|
||||
" (idx, w)\n",
|
||||
" for idx, words in enumerate(final_wordlist)\n",
|
||||
" for w in words\n",
|
||||
"]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "07c1293c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"print(len(lemmatize_mappings))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "19c255d0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": []
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.11.2"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
@ -1,159 +0,0 @@
|
||||
#!/usr/bin/env python3
|
||||
# coding: utf-8
|
||||
|
||||
print("Step 1")

try:
    # Re-run guard carried over from the notebook: the name only exists once
    # the model has been loaded earlier in the same interpreter session.
    _initialized
except NameError:
    # Only a missing `_initialized` means "not set up yet". A bare `except`
    # would also hide KeyboardInterrupt and unrelated failures.
    #
    # One-time environment setup:
    #   pip install spacy
    #   python -m spacy download en_core_web_trf
    import spacy
    from tqdm import tqdm

    # The pipeline is loaded with the 'parser' and 'ner' components disabled.
    nlp = spacy.load('en_core_web_trf', disable=['parser', 'ner'])

    _initialized = True
|
||||
|
||||
import pandas as pd
|
||||
import gzip
|
||||
import re
|
||||
|
||||
|
||||
print("Step 2")
|
||||
|
||||
|
||||
def get_lines(filename, max_lines=30_001):
    """Return the leading lines of a gzip-compressed file, lower-cased.

    The file is opened in binary mode and every line is converted with
    ``str()`` on the raw ``bytes`` object, so each returned entry keeps the
    ``b'...'`` wrapper and an escaped line terminator. This is kept on
    purpose: the caller in this script slices entries by fixed character
    offsets.

    Args:
        filename: Path of the gzip file to read.
        max_lines: Maximum number of lines to return. The default of 30_001
            matches the previously hard-coded limit. ``None`` reads the
            whole file.

    Returns:
        A list with at most ``max_lines`` lower-cased line representations,
        in file order.
    """
    from itertools import islice

    with gzip.open(filename, 'r') as f:
        return [str(raw_line).lower() for raw_line in islice(f, max_lines)]
|
||||
|
||||
|
||||
|
||||
# Number of groups the final wordlist should contain.
# NOTE(review): the reason for the 3 extra entries on top of 8192 is not
# visible in this script -- confirm before changing.
WORDLIST_SIZE = 8192 + 3
# Accepts only tokens made up entirely of ASCII letters.
word_re = re.compile(r"^[A-Za-z]+$")


print("Step 3")


annotated_words = pd.read_excel("annotated_words.ods")

# Every word whose "keep" column is anything other than "Yes" is excluded.
excluded_words = list(annotated_words[annotated_words["keep"] != "Yes"]["word"].str.lower())

# Build (word, mapping) pairs from the rows that have a "maps_to" value. The
# cell holds a comma-separated list; whitespace around each entry is stripped
# and empty entries (e.g. from a trailing comma) are dropped so that they can
# never end up in the wordlist as bogus "words".
mapped_rows = annotated_words[annotated_words["maps_to"].notna()]
custom_maps = [
    (word.lower(), mapping.strip().lower())
    for word, maps_to in zip(mapped_rows["word"], mapped_rows["maps_to"])
    for mapping in maps_to.split(",")
    if mapping.strip()
]
|
||||
|
||||
|
||||
print("Step 4")


# Start parsing the wordlist
all_words = get_lines("00-frequency-all.txt.gz")

# Delete header line
all_words = all_words[1:]

# Get only the word (fixed width)
all_words = [w[13:36].strip() for w in all_words]

# Remove special characters
all_words = [w for w in all_words if word_re.search(w)]

# Remove all removed words (set lookup instead of scanning the list per word)
excluded_lookup = set(excluded_words)
all_words = [w for w in all_words if w not in excluded_lookup]

# Add all custom mappings: both the source word and its mapping target must be
# present in the word list. Iterate the pairs directly -- flattening them first
# and then indexing each element picked out single characters of every word.
known_words = set(all_words)
for pair in custom_maps:
    for w in pair:
        if w not in known_words:
            known_words.add(w)
            all_words.append(w)
|
||||
|
||||
|
||||
print("Step 5")


# Lemmatize all words (plural -> singular); only the first token's lemma of
# each processed word is used.
lemmatize_mappings = [(w, nlp(w)[0].lemma_) for w in tqdm(all_words)]
print(lemmatize_mappings[:100])

# Add custom lemmatizations. A set mirrors the list so the duplicate check
# does not rescan the whole list for every custom pair.
known_pairs = set(lemmatize_mappings)
for custom_pair in custom_maps:
    if custom_pair in known_pairs:
        print(f"Warning: {custom_pair} is already lemmatized")
    else:
        known_pairs.add(custom_pair)
        lemmatize_mappings.append(custom_pair)

print(lemmatize_mappings[:100])

# Drop every mapping whose target (lemma) is an excluded word.
excluded_lookup = set(excluded_words)
lemmatize_mappings = [pair for pair in lemmatize_mappings if pair[1] not in excluded_lookup]
print(lemmatize_mappings[:100])

# Now, re-add all lemmatized words to the list of every word. Walking the
# pairs in order (word, then lemma) keeps the same append order as flattening
# the list, without building a new tuple for every pair.
known_words = set(all_words)
for pair in lemmatize_mappings:
    for w in pair:
        if w not in known_words:
            print(w)
            known_words.add(w)
            all_words.append(w)

# Collapse to word -> lemma. For a word that appears more than once the last
# pair wins, so custom mappings (appended after the automatic ones) take
# precedence.
lemmatize_mappings = dict(lemmatize_mappings)
|
||||
|
||||
|
||||
print("Step 6")


# Group the mapped words by their lemma once, instead of scanning every
# mapping key again for each new lemma. Dict insertion order keeps the words
# of a group in the same order as before.
words_by_lemma = {}
for word, lemma in lemmatize_mappings.items():
    words_by_lemma.setdefault(lemma, []).append(word)

final_wordlist = []
seen_lemmatizations = set()
for w in all_words:
    # Words without a (non-empty) mapping stand for themselves.
    lemmatized = lemmatize_mappings.get(w) or w
    if lemmatized in seen_lemmatizations:
        # The lemmatized version of this word was already seen
        continue

    # The lemmatized version hasn't been seen. We're good to add it.
    # NOTE(review): a lemma that no word maps to still occupies a slot here,
    # as an empty group -- confirm that this is intended.
    final_wordlist.append(words_by_lemma.get(lemmatized, []))
    seen_lemmatizations.add(lemmatized)

    if len(final_wordlist) >= WORDLIST_SIZE:
        break

# Now, convert it to the format (number, word); numbers are 0-based here and
# shared by all words of the same group.
final_wordlist = [
    (idx, w)
    for idx, words in enumerate(final_wordlist)
    for w in words
]
|
||||
|
||||
|
||||
print("Step 7")

print(len(lemmatize_mappings))

print("Step 8")

# Write one "WORD,number" row per (number, word) pair, with the word
# upper-cased, below a fixed header line.
with open("01-generated-wordlist.csv", "w") as f:
    f.write("word,number\n")

    for number, word in final_wordlist:
        f.write(f"{word.upper()},{number}\n")

print("Done")
|
Loading…
Reference in New Issue
Block a user