import pandas as pd
import nltk
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
data = pd.read_csv("/datasets/tweets_lemm.csv")
corpus = data['lemm_text'].values.astype('U')
nltk.download('stopwords')
stopwords = set(nltk_stopwords.words('russian'))
count_tf_idf = TfidfVectorizer(stop_words=stopwords)
tf_idf = count_tf_idf.fit_transform(corpus)
print("Размер матрицы:", tf_idf.shape)