>>> from techminer2.topic_modeling import documents_by_theme_frame
>>> from sklearn.decomposition import LatentDirichletAllocation
>>> documents_by_theme_frame(
...     field="author_keywords",
...     #
...     # TF PARAMS:
...     is_binary=True,
...     cooc_within=2,
...     #
...     # TF-IDF PARAMS:
...     norm=None,
...     use_idf=False,
...     smooth_idf=False,
...     sublinear_tf=False,
...     #
...     # TOP TERMS:
...     n_top_terms=5,
...     #
...     # ITEM FILTERS:
...     top_n=None,
...     occ_range=(None, None),
...     gc_range=(None, None),
...     custom_terms=None,
...     #
...     # ESTIMATOR:
...     sklearn_estimator=LatentDirichletAllocation(
...         n_components=10,
...         learning_decay=0.7,
...         learning_offset=50.0,
...         max_iter=10,
...         batch_size=128,
...         evaluate_every=-1,
...         perp_tol=0.1,
...         mean_change_tol=0.001,
...         max_doc_update_iter=100,
...         random_state=0,
...     ),
...     #
...     # DATABASE PARAMS:
...     root_dir="example/",
...     database="main",
...     year_filter=(None, None),
...     cited_by_filter=(None, None),
... ).head()  # doctest: +NORMALIZE_WHITESPACE
cluster 0 ... 9
article ...
Anagnostopoulos I., 2018, J ECON BUS, V100, P7 0.871422 ... 0.014286
Anshari M., 2019, ENERGY PROCEDIA, V156, P234 0.014286 ... 0.014286
Buchak G., 2018, J FINANC ECON, V130, P453 0.014287 ... 0.014287
Cai C.W., 2018, ACCOUNT FINANC, V58, P965 0.020003 ... 0.020001
Chen L., 2016, CHINA ECON J, V9, P225 0.025005 ... 0.774979
[5 rows x 10 columns]