Dataset Overview
Once you’ve loaded your data using the helper functions, generate basic statistics:Copy
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, Optional
# Load your data (using helper functions from previous section)
df_analysis = normalize_df_column_names(merged_df)
# Basic dataset information
print("📊 Dataset Overview")
print(f"Total records: {len(df_analysis):,}")
print(f"Date range: {df_analysis['date'].min()} to {df_analysis['date'].max()}")
print(f"Unique tickers: {df_analysis['ticker'].nunique():,}")
print(f"Unique channels: {df_analysis['channel'].nunique():,}")
print(f"Unique videos: {df_analysis['video_id'].nunique():,}")
# Data quality check
total_mentions = len(df_analysis)
resolved_tickers = len(df_analysis[df_analysis['ticker'] != 'NOT_FOUND'])
print(f"Ticker resolution rate: {(resolved_tickers/total_mentions)*100:.1f}%")
Visualization Functions
1. Ticker Mentions Over Time
Analyze how ticker mentions change over different time periods:Copy
def plot_total_tickers_over_period(
df_plt: pd.DataFrame,
binning_size_months: int,
ticker_filter: str = None
) -> tuple:
"""
Plot the total number of ticker mentions over a specified period.
Args:
df_plt (pd.DataFrame): The DataFrame containing the data to plot.
binning_size_months (int): The number of months to group by (1=monthly, 3=quarterly, 12=yearly).
ticker_filter (str): Optional ticker symbol to filter by (e.g., 'AAPL').
Returns:
tuple: (DataFrame with counts, plot title)
"""
if ticker_filter and ticker_filter != '':
# Filter the DataFrame by the selected ticker
df_plt = df_plt[df_plt['entity_symbol'] == ticker_filter]
plot_title = f'Total Mentions of {ticker_filter} by {binning_size_months} Month Period'
else:
plot_title = f'Total Ticker Mentions per {binning_size_months} Month Period'
# Extract year and period (using the month and grouping every n months)
df_plt['date'] = pd.to_datetime(df_plt['date'])
df_plt['year'] = df_plt['date'].dt.year
df_plt['n_month'] = (df_plt['date'].dt.month - 1) // binning_size_months + 1
# Create period label
df_plt['n_month_label'] = df_plt['year'].astype(str) + ' - ' + df_plt['n_month'].astype(str)
# Group by periods and count ticker mentions
n_monthly_counts = df_plt.groupby('n_month_label')['entity_symbol'].count().reset_index(name='count')
return n_monthly_counts, plot_title
# Example usage
df_plot = df_analysis.copy()
binning_size_months = 3 # Quarterly analysis
ticker_filter = 'AAPL' # Focus on Apple, or set to None for all tickers
n_monthly_counts, plot_title = plot_total_tickers_over_period(
df_plot, binning_size_months, ticker_filter
)
# Create the plot
plt.figure(figsize=(12, 6))
sns.barplot(data=n_monthly_counts, x='n_month_label', y='count', color='skyblue')
plt.xticks(rotation=45)
plt.title(plot_title)
plt.xlabel('Period')
plt.ylabel('Mention Count')
plt.tight_layout()
plt.show()
2. Ticker Distribution Analysis
Understand which tickers are mentioned most frequently:Copy
def analyze_ticker_distribution(df_analysis, min_count=10):
"""
Analyze the distribution of ticker mentions.
Args:
df_analysis (pd.DataFrame): Your normalized dataset
min_count (int): Minimum mentions to include in analysis
"""
# Count ticker frequencies
ticker_counts = df_analysis['ticker'].value_counts().reset_index()
ticker_counts.columns = ['ticker', 'count']
# Filter out tickers with low counts
ticker_counts = ticker_counts[ticker_counts['count'] >= min_count]
# Summary statistics
most_common = ticker_counts.iloc[0]
least_common = ticker_counts.iloc[-1]
unique_tickers = ticker_counts['ticker'].nunique()
print(f"📈 Ticker Distribution (≥{min_count} mentions)")
print(f"Unique tickers: {unique_tickers:,}")
print(f"Most mentioned: {most_common['ticker']} ({most_common['count']:,} mentions)")
print(f"Least mentioned: {least_common['ticker']} ({least_common['count']:,} mentions)")
# Plot distribution
plt.figure(figsize=(14, 6))
sns.barplot(data=ticker_counts, x='ticker', y='count', color='skyblue')
# Remove x-axis labels for clarity
plt.xticks([])
plt.xlabel('Tickers')
plt.ylabel('Mention Count (Log Scale)')
plt.title(f'Ticker Mention Distribution (Unique: {unique_tickers}, ≥{min_count} mentions)\n'
f'Most: {most_common["ticker"]} ({most_common["count"]:,}) | '
f'Least: {least_common["ticker"]} ({least_common["count"]:,})')
plt.yscale('log')
plt.tight_layout()
plt.show()
return ticker_counts
# Run the analysis
ticker_distribution = analyze_ticker_distribution(df_analysis, min_count=10)
3. Top Channels Analysis
Identify the most active channels in your dataset:Copy
def analyze_top_channels(df_analysis, n_channels=10):
"""
Analyze the top channels by ticker mention count.
Args:
df_analysis (pd.DataFrame): Your normalized dataset
n_channels (int): Number of top channels to display
"""
# Get top channels by ticker count
top_channels = df_analysis['channel'].value_counts().head(n_channels).index
df_top_channels = df_analysis[df_analysis['channel'].isin(top_channels)]
# Calculate channel statistics
channel_counts = df_top_channels['channel'].value_counts().reset_index()
channel_counts.columns = ['channel', 'count']
# Find most common ticker per channel
most_common_ticker = (
df_top_channels.groupby('channel')['ticker']
.agg(lambda x: x.value_counts().idxmax())
.reset_index()
)
most_common_ticker.columns = ['channel', 'top_ticker']
channel_counts = channel_counts.merge(most_common_ticker, on='channel')
print(f"📺 Top {n_channels} Channels by Ticker Mentions")
for _, row in channel_counts.head(5).iterrows():
print(f" {row['channel']}: {row['count']:,} mentions (top ticker: {row['top_ticker']})")
# Plot channel analysis
plt.figure(figsize=(14, 8))
ax = sns.barplot(data=channel_counts, y='channel', x='count', color='skyblue')
# Add ticker labels to bars
for i, (count, ticker) in enumerate(zip(channel_counts['count'], channel_counts['top_ticker'])):
ax.text(count + max(channel_counts['count'])*0.01, i, ticker, va='center', fontsize=10)
plt.title(f'Top {n_channels} Channels by Ticker Mentions\n(Labeled with Most Common Ticker)')
plt.xlabel('Mention Count')
plt.ylabel('Channel')
plt.tight_layout()
plt.show()
return channel_counts
# Run the analysis
top_channels = analyze_top_channels(df_analysis, n_channels=8)
4. Temporal Trends Analysis
Analyze trends over time for top tickers:Copy
def analyze_temporal_trends(df_analysis, start_year=2017, end_year=2024, top_n=5):
"""
Analyze temporal trends for top tickers.
Args:
df_analysis (pd.DataFrame): Your normalized dataset
start_year (int): Start year for analysis
end_year (int): End year for analysis
top_n (int): Number of top tickers to track
"""
# Prepare data
df_temporal = df_analysis.copy()
df_temporal['date'] = pd.to_datetime(df_temporal['date'])
df_temporal['year'] = df_temporal['date'].dt.year
# Filter by year range
df_temporal = df_temporal[
(df_temporal['year'] >= start_year) &
(df_temporal['year'] <= end_year)
]
# Get top tickers per year
top_per_year = (
df_temporal.groupby(['year', 'ticker']).size()
.reset_index(name='count')
.sort_values(['year', 'count'], ascending=[True, False])
)
top_tickers_by_year = top_per_year.groupby('year').head(top_n)
# Plot trends
plt.figure(figsize=(12, 6))
for ticker, group in top_tickers_by_year.groupby('ticker'):
group = group.sort_values('year')
plt.plot(group['year'], group['count'], marker='o', label=ticker, linewidth=2)
plt.legend(title='Tickers', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.title(f'Top {top_n} Tickers per Year ({start_year}-{end_year}) by Mention Count')
plt.xlabel('Year')
plt.ylabel('Mention Count')
plt.xticks(sorted(df_temporal['year'].unique()))
plt.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
return top_tickers_by_year
# Run temporal analysis
temporal_trends = analyze_temporal_trends(df_analysis, start_year=2018, end_year=2024)
Sentiment Analysis
If your dataset includes sentiment data, analyze sentiment patterns:Copy
def analyze_sentiment_patterns(df_analysis):
"""
Analyze sentiment patterns in the dataset.
Args:
df_analysis (pd.DataFrame): Your normalized dataset with sentiment columns
"""
if 'entity_sentiment_overt_buy_sell' not in df_analysis.columns:
print("⚠️ Sentiment columns not found in dataset")
return
# Create sentiment matrix
sentiment_matrix = df_analysis.groupby([
'entity_sentiment_overt_buy_sell',
'entity_sentiment_generic'
]).size().unstack(fill_value=0)
# Plot sentiment heatmap
plt.figure(figsize=(10, 6))
sns.heatmap(
sentiment_matrix,
annot=True,
fmt='d',
cmap='Blues',
cbar_kws={'label': 'Count'}
)
plt.title('Sentiment Analysis Matrix')
plt.xlabel('Generic Sentiment')
plt.ylabel('Buy/Sell Sentiment')
plt.tight_layout()
plt.show()
# Print summary
total_sentiment_records = sentiment_matrix.sum().sum()
print(f"📊 Sentiment Summary")
print(f"Total records with sentiment: {total_sentiment_records:,}")
print(f"Sentiment coverage: {(total_sentiment_records/len(df_analysis))*100:.1f}%")
return sentiment_matrix
# Run sentiment analysis (if sentiment columns exist)
try:
sentiment_analysis = analyze_sentiment_patterns(df_analysis)
except Exception as e:
print(f"Sentiment analysis skipped: {e}")
Advanced Analysis Functions
5. Polarizing Tickers Analysis
Identify the most controversial or polarizing tickers based on sentiment patterns:Copy
def analyze_polarizing_tickers(df_analysis, min_mentions=50):
"""
Analyze which tickers generate the most polarizing sentiment.
Args:
df_analysis (pd.DataFrame): Your normalized dataset with sentiment columns
min_mentions (int): Minimum mentions required to include ticker in analysis
Returns:
pd.DataFrame: DataFrame with polarization metrics
"""
if 'entity_sentiment_overt_buy_sell' not in df_analysis.columns:
print("⚠️ Sentiment columns not found in dataset")
return pd.DataFrame()
sentiment_df = df_analysis.copy()
# Remove rows where ticker is 'NOT_FOUND' or missing
sentiment_df = sentiment_df[
(sentiment_df['ticker'] != 'NOT_FOUND') &
(sentiment_df['ticker'].notna())
]
# Create binary sentiment indicators
sentiment_df['overt_positive'] = sentiment_df['entity_sentiment_overt_buy_sell'].apply(
lambda x: 1 if x == 'POSITIVE' else 0
)
sentiment_df['overt_negative'] = sentiment_df['entity_sentiment_overt_buy_sell'].apply(
lambda x: 1 if x == 'NEGATIVE' else 0
)
sentiment_df['generic_positive'] = sentiment_df['entity_sentiment_generic'].apply(
lambda x: 1 if x == 'POSITIVE' else 0
)
sentiment_df['generic_negative'] = sentiment_df['entity_sentiment_generic'].apply(
lambda x: 1 if x == 'NEGATIVE' else 0
)
# Aggregate by ticker
sentiment_agg = sentiment_df.groupby('ticker').agg({
'overt_positive': 'sum',
'overt_negative': 'sum',
'generic_positive': 'sum',
'generic_negative': 'sum',
'entity_symbol': 'count'
}).reset_index()
# Calculate polarization ratios
sentiment_agg['overt_ratio'] = sentiment_agg[['overt_positive', 'overt_negative']].min(axis=1) / sentiment_agg['entity_symbol']
sentiment_agg['generic_ratio'] = sentiment_agg[['generic_positive', 'generic_negative']].min(axis=1) / sentiment_agg['entity_symbol']
# Rename mentions column
sentiment_agg.rename(columns={'entity_symbol': 'mentions_count'}, inplace=True)
# Filter by minimum mentions and sort by polarization
sentiment_agg = sentiment_agg[sentiment_agg['mentions_count'] >= min_mentions]
sentiment_agg = sentiment_agg.sort_values(by=['overt_ratio', 'generic_ratio'], ascending=[False, False])
# Get top 10 most polarizing tickers
top_controversial = sentiment_agg.head(10)
print(f"📊 Most Polarizing Tickers (≥{min_mentions} mentions)")
for _, row in top_controversial.head(5).iterrows():
pos_pct = (row['overt_positive'] / row['mentions_count']) * 100
neg_pct = (row['overt_negative'] / row['mentions_count']) * 100
print(f" {row['ticker']}: {pos_pct:.1f}% positive, {neg_pct:.1f}% negative ({row['mentions_count']} total)")
# Plot polarizing tickers
plt.figure(figsize=(14, 8))
plt.bar(top_controversial['ticker'], top_controversial['overt_positive'],
color='blue', label='Overt Positive')
plt.bar(top_controversial['ticker'], -top_controversial['overt_negative'],
color='red', label='Overt Negative')
plt.title('Most Polarizing Tickers by Sentiment Distribution')
plt.xlabel('Ticker')
plt.ylabel('Sentiment Count')
plt.xticks(rotation=45)
plt.legend()
plt.tight_layout()
plt.show()
return sentiment_agg
# Run polarizing tickers analysis
try:
polarizing_analysis = analyze_polarizing_tickers(df_analysis, min_mentions=50)
except Exception as e:
print(f"Polarizing tickers analysis skipped: {e}")
6. Speaker Analysis
Analyze speakers:Copy
# Speaker Analysis
df_speakers = df_visuals_channels.copy()
speakers_and_entity_counts = df_speakers.groupby(['speaker_name', 'speaker_associated_entity', 'speaker_role_context']).size().reset_index(name='count')
# Top hosts (where speaker_role_context == 'host')
top_hosts = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'host']
top_hosts = top_hosts.sort_values(by='count', ascending=False).head(10)
# Top guests (where speaker_role_context == 'guest')
top_guests = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'guest']
top_guests = top_guests.sort_values(by='count', ascending=False).head(10)
# Top quoted clips (where speaker_role_context == 'quoted clip')
top_quoted_clips = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'quoted clip']
top_quoted_clips = top_quoted_clips.sort_values(by='count', ascending=False).head(10)
# Graph each of these top speaker categories
plt.figure(figsize=(14, 8))
# Plot the top hosts
plt.subplot(3, 1, 1)
plt.barh(top_hosts['speaker_name'], top_hosts['count'], color='skyblue')
plt.title('Top 10 Hosts by Count')
plt.xlabel('Count')
plt.ylabel('Host Name')
plt.xticks(rotation=45)
plt.tight_layout()
plt.subplot(3, 1, 2)
# Plot the top guests
plt.barh(top_guests['speaker_name'], top_guests['count'], color='skyblue')
plt.title('Top 10 Guests by Count')
plt.xlabel('Count')
plt.ylabel('Guest Name')
plt.xticks(rotation=45)
plt.tight_layout()
plt.subplot(3, 1, 3)
# Plot the top quoted clips
plt.barh(top_quoted_clips['speaker_name'], top_quoted_clips['count'], color='skyblue')
plt.title('Top 10 Quoted Clips by Count')
plt.xlabel('Count')
plt.ylabel('Quoted Clip Name')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Show top 20 guests with their associated entity names
top_guests = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'guest']
# Add the entity name to the guest name in a separate column
top_guests['guest_with_entity'] = top_guests['speaker_name'] + ' (' + top_guests['speaker_associated_entity'].str.slice(0, 25) + ')'
# Sort by count and take the top 20
top_guests = top_guests.sort_values(by='count', ascending=False).head(20)
# Plot the top 20 guests with their associated entity names
plt.figure(figsize=(14, 8))
plt.barh(top_guests['guest_with_entity'], top_guests['count'], color='skyblue')
plt.title('Top 20 Guests by Count with Associated Entity Names')
plt.xlabel('Count')
plt.ylabel('Guest Name (Associated Entity)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
# Unique counts of guests and hosts
vid_count = df_visuals_channels['video_id'].nunique()
# Get the unique counts of guests and hosts
unique_guests = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'guest']['speaker_name'].nunique()
unique_hosts = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'host']['speaker_name'].nunique()
unique_hosts_ratio = unique_hosts / (vid_count)
unique_guests_ratio = unique_guests / (vid_count)
print(f"Unique Hosts: {unique_hosts} ({unique_hosts_ratio:.1%})")
print(f"Unique Guests: {unique_guests} ({unique_guests_ratio:.1%})")
7. Ticker-Channel Relationships
Find which channels mentioned specific tickers most:Copy
# For a given ticker, which channel mentioned it the most
ticker_queries = [
"CMCSA",
"ULTA",
"TSCO"
]
df_plt6 = df_visuals_channels.copy()
df_plt6 = df_plt6[(df_plt6['ticker'] != 'NOT_FOUND') & (df_plt6['ticker'].notna())]
for ticker_query in ticker_queries:
# Get the channel that mentioned the ticker the most
df_ticker = df_plt6[df_plt6['ticker'] == ticker_query]
top_channel = df_ticker['channel'].value_counts().idxmax()
mention_count = df_ticker['channel'].value_counts().max()
print(f"Channel that mentioned {ticker_query} the most: {top_channel} ({mention_count} times)")
Year-Based Analysis
For a given year, list the top tickers and channels:Copy
# Customizable variables
target_year = 2022
top_n = 5
df_plt5 = df_visuals_channels.copy()
# Filter the data to include only data from target_year
df_plt5['date'] = pd.to_datetime(df_plt5['date'])
df_plt5['year'] = df_plt5['date'].dt.year
df_year = df_plt5[df_plt5['year'] == target_year]
# Top N tickers
top_tickers = df_year['ticker'].value_counts().head(top_n).index
df_top_tickers = df_year[df_year['ticker'].isin(top_tickers)]
# Get top_n channels mentioning these
top_channels = df_top_tickers['channel'].value_counts().head(10)
print("Top Tickers:")
print(df_year['ticker'].value_counts().head(top_n))
print("\nTop 10 Channels mentioning those tickers:")
print(top_channels)
Next Steps
With your summary statistics generated, you can:- YouTube Core Dataset Overview - Learn about entity-focused analysis features
- YouTube Extended Dataset Overview - Discover complete transcript analysis capabilities
- Data Dictionaries - Complete field definitions for both datasets
Save your analysis code in a Jupyter notebook for easy reuse and sharing with your team.