Term Occurrence by ClusterΒΆ

Example

>>> from sklearn.cluster import KMeans
>>> from techminer2.packages.document_clustering import TermOccurrenceByCluster
>>> # Initialize the clustering algorithm
>>> kmeans = KMeans(
...     n_clusters=8,
...     init="k-means++",
...     n_init=10,
...     max_iter=300,
...     tol=0.0001,
...     algorithm="lloyd",
...     random_state=0,
... )
>>> # Generate term occurrence by cluster data frame
>>> df = (
...     TermOccurrenceByCluster()
...     #
...     # FIELD:
...     .with_field("raw_keywords")
...     .having_terms_in_top(100)
...     .having_terms_ordered_by("OCC")
...     .having_term_occurrences_between(None, None)
...     .having_term_citations_between(None, None)
...     .having_terms_in(None)
...     #
...     # COUNTERS:
...     .using_term_counters(True)
...     #
...     # TFIDF:
...     .using_binary_term_frequencies(False)
...     .using_row_normalization(None)
...     .using_idf_reweighting(False)
...     .using_idf_weights_smoothing(False)
...     .using_sublinear_tf_scaling(False)
...     #
...     # CLUSTERING:
...     .using_clustering_algorithm_or_dict(kmeans)
...     #
...     # DATABASE:
...     .where_root_directory_is("example/")
...     .where_database_is("main")
...     .where_record_years_range_is(None, None)
...     .where_record_citations_range_is(None, None)
...     .where_records_match(None)
...     #
...     .run()
... ).head(20)
>>> # Display the resulting data frame
>>> print(df)  
cluster                                0   1  2  3  4  5  6  7
raw_keywords
FINTECH 32:5393                        1  23  1  2  1  1  0  3
FINANCE 11:1950                        4   3  0  1  1  1  1  0
INNOVATION 08:0990                     0   4  0  4  0  0  0  0
FINANCIAL_SERVICES 05:0746             0   2  0  3  0  0  0  0
FINANCIAL_SERVICE 04:1036              1   0  0  1  0  1  1  0
BUSINESS_MODELS 03:1335                0   2  0  0  0  0  1  0
BLOCKCHAIN 03:0881                     0   2  0  0  0  0  1  0
COMMERCE 03:0846                       1   1  0  0  0  0  1  0
FINANCIAL_INCLUSION 03:0590            0   2  1  0  0  0  0  0
FINANCIAL_INSTITUTION 03:0488          0   2  0  1  0  0  0  0
SURVEYS 03:0484                        0   2  0  0  0  1  0  0
FINANCIAL_TECHNOLOGY 03:0461           1   2  0  0  0  0  0  0
BANKING 03:0370                        0   0  0  3  0  0  0  0
CROWDFUNDING 03:0335                   1   2  0  0  0  0  0  0
MARKETPLACE_LENDING 03:0317            0   0  0  0  0  0  0  3
ELECTRONIC_MONEY 03:0305               1   1  0  1  0  0  0  0
SUSTAINABILITY 03:0227                 3   0  0  0  0  0  0  0
SUSTAINABLE_DEVELOPMENT 03:0227        3   0  0  0  0  0  0  0
FINANCIAL_SERVICES_INDUSTRIES 02:0696  1   0  0  0  0  0  1  0
LITERATURE_REVIEW 02:0560              1   1  0  0  0  0  0  0