>>> from techminer2.topic_modeling import terms_by_cluster_frame
>>> from sklearn.decomposition import LatentDirichletAllocation
>>> terms_by_cluster_frame(
...     field="author_keywords",
...     #
...     # TF PARAMS:
...     is_binary=True,
...     cooc_within=3,
...     #
...     # TF-IDF PARAMS:
...     norm=None,
...     use_idf=False,
...     smooth_idf=False,
...     sublinear_tf=False,
...     #
...     # ITEM FILTERS:
...     top_n=None,
...     occ_range=(None, None),
...     gc_range=(None, None),
...     custom_terms=None,
...     #
...     # ESTIMATOR:
...     sklearn_estimator=LatentDirichletAllocation(
...         n_components=10,
...         learning_decay=0.7,
...         learning_offset=50.0,
...         max_iter=10,
...         batch_size=128,
...         evaluate_every=-1,
...         perp_tol=0.1,
...         mean_change_tol=0.001,
...         max_doc_update_iter=100,
...         random_state=0,
...     ),
...     #
...     # DATABASE PARAMS:
...     root_dir="example/",
...     database="main",
...     year_filter=(None, None),
...     cited_by_filter=(None, None),
... ).head()
cluster 0 ... 9
term ...
0 FINTECH 31:5168 ... FINTECH 31:5168
1 FINANCIAL_SERVICES 04:0667 ... INNOVATION 07:0911
2 FINANCIAL_TECHNOLOGY 03:0461 ... FINANCIAL_INCLUSION 03:0590
3 INNOVATION 07:0911 ... MOBILE_PAYMENT 02:0184
4 SERVICE_INNOVATION_STRATEGY 01:0079 ... CASE_STUDY 02:0340
[5 rows x 10 columns]