import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
from torch.nn.functional import softmax
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from nltk.tokenize import sent_tokenize
# One-time setup: VADER lexicon for magnitude scoring, FinBERT for sentiment.
nltk.download('vader_lexicon')
# FinBERT sequence-classification model; the scoring loop below reads three
# class probabilities (indices 0/1/2) from its output.
model = AutoModelForSequenceClassification.from_pretrained('ProsusAI/finBERT')
tokenizer = AutoTokenizer.from_pretrained('ProsusAI/finBERT')
# VADER analyzer — used only for per-sentence magnitude (abs of 'compound').
sia = SentimentIntensityAnalyzer()
# keywords.xlsx must provide columns 'Key Words/Topics' and 'Key Word Category'
# (both are read in the loop below).
keywords_df = pd.read_excel('keywords.xlsx')
# ads.xlsx: each column holds text entries that are scanned for keywords.
ads_df = pd.read_excel('ads.xlsx')
def split_text(text, chunk_size):
    """Split *text* into consecutive pieces of at most *chunk_size* characters.

    The final piece may be shorter; an empty string yields an empty list.
    """
    pieces = []
    start = 0
    while start < len(text):
        pieces.append(text[start:start + chunk_size])
        start += chunk_size
    return pieces
def analyze_sentiment(text):
    """Run FinBERT over *text* and return its class-probability vector.

    Returns a 1-D tensor of softmax probabilities (one entry per model label)
    for the single input sequence.
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
    # Inference only: disable autograd so no graph is built — the original
    # computed gradients for nothing, wasting time and memory on every call.
    with torch.no_grad():
        outputs = model(**inputs)
    probabilities = softmax(outputs.logits, dim=1)
    return probabilities[0]
# For each text column in the ads workbook, score every entry that mentions a
# keyword and write one result workbook per column.
for column in ads_df.columns:
    # Collect plain dicts and build the DataFrame once at the end — the
    # original pd.concat-per-row pattern is quadratic in the number of rows.
    rows = []
    for index, row in keywords_df.iterrows():
        # str() guards against NaN/numeric keyword cells, which would crash
        # the original on .lower().
        keyword = str(row['Key Words/Topics'])
        category = row['Key Word Category']
        keyword_lower = keyword.lower()
        # Keep only the cells of this column that mention the keyword
        # (case-insensitive substring match).
        paragraphs = ads_df[column].apply(
            lambda x: str(x) if keyword_lower in str(x).lower() else None
        ).dropna()
        for paragraph in paragraphs:
            # FinBERT input length is limited, so score fixed-size chunks.
            for chunk in split_text(paragraph, 1024):
                probabilities = analyze_sentiment(chunk)
                # Collapse class probabilities to a score in [-1, 1]:
                # indices 1, 2, 0 weighted 1, 2, 3 then shifted by -2 —
                # presumably negative/neutral/positive; confirm against the
                # model's id2label mapping.
                sentiment_score = (probabilities[1] + (probabilities[2] * 2) + (probabilities[0] * 3)) - 2
                # Magnitude = sum of per-sentence |VADER compound| scores.
                # NOTE: the original also ran FinBERT once per sentence and
                # discarded the result; that dead model call is removed here.
                total_magnitude = sum(
                    abs(sia.polarity_scores(sentence)['compound'])
                    for sentence in sent_tokenize(chunk)
                )
                rows.append({
                    'Key Word Category': category,
                    'Keyword': keyword,
                    'Paragraph': chunk,
                    'Sentiment Score': sentiment_score.item(),
                    'Magnitude': total_magnitude,
                })
    counts_df = pd.DataFrame(
        rows,
        columns=['Key Word Category', 'Keyword', 'Paragraph', 'Sentiment Score', 'Magnitude'],
    )
    counts_df.to_excel(f'output_{column}.xlsx', index=False)
# (Removed: "No comments:" / "Post a Comment" — blog-page footer text that was
# accidentally captured with the source; it is not Python and broke the file.)