Skip to main content
This page provides comprehensive statistical analysis and visualizations for your YouTube Core and YouTube Extended datasets. Use these tools to understand your data distribution, identify trends, and get quick insights.

Dataset Overview

Once you’ve loaded your data using the helper functions, generate basic statistics:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Tuple, Optional

# Load your data (using helper functions from previous section)
df_analysis = normalize_df_column_names(merged_df)

# Basic dataset information
print("📊 Dataset Overview")
print(f"Total records: {len(df_analysis):,}")
print(f"Date range: {df_analysis['date'].min()} to {df_analysis['date'].max()}")
print(f"Unique tickers: {df_analysis['ticker'].nunique():,}")
print(f"Unique channels: {df_analysis['channel'].nunique():,}")
print(f"Unique videos: {df_analysis['video_id'].nunique():,}")

# Data quality check
total_mentions = len(df_analysis)
resolved_tickers = len(df_analysis[df_analysis['ticker'] != 'NOT_FOUND'])
print(f"Ticker resolution rate: {(resolved_tickers/total_mentions)*100:.1f}%")

Visualization Functions

1. Ticker Mentions Over Time

Analyze how ticker mentions change over different time periods:
def plot_total_tickers_over_period(
    df_plt: pd.DataFrame, 
    binning_size_months: int, 
    ticker_filter: str = None
) -> tuple:   
    """
    Plot the total number of ticker mentions over a specified period.
    
    Args:
        df_plt (pd.DataFrame): The DataFrame containing the data to plot.
        binning_size_months (int): The number of months to group by (1=monthly, 3=quarterly, 12=yearly).
        ticker_filter (str): Optional ticker symbol to filter by (e.g., 'AAPL').
    
    Returns:
        tuple: (DataFrame with counts, plot title)
    """
    if ticker_filter and ticker_filter != '':
        # Filter the DataFrame by the selected ticker
        df_plt = df_plt[df_plt['entity_symbol'] == ticker_filter]
        plot_title = f'Total Mentions of {ticker_filter} by {binning_size_months} Month Period'
    else: 
        plot_title = f'Total Ticker Mentions per {binning_size_months} Month Period'

    # Extract year and period (using the month and grouping every n months)
    df_plt['date'] = pd.to_datetime(df_plt['date'])
    df_plt['year'] = df_plt['date'].dt.year
    df_plt['n_month'] = (df_plt['date'].dt.month - 1) // binning_size_months + 1

    # Create period label
    df_plt['n_month_label'] = df_plt['year'].astype(str) + ' - ' + df_plt['n_month'].astype(str)

    # Group by periods and count ticker mentions
    n_monthly_counts = df_plt.groupby('n_month_label')['entity_symbol'].count().reset_index(name='count')

    return n_monthly_counts, plot_title

# Example usage
df_plot = df_analysis.copy()
binning_size_months = 3  # Quarterly analysis
ticker_filter = 'AAPL'  # Focus on Apple, or set to None for all tickers

n_monthly_counts, plot_title = plot_total_tickers_over_period(
    df_plot, binning_size_months, ticker_filter
)

# Create the plot
plt.figure(figsize=(12, 6))
sns.barplot(data=n_monthly_counts, x='n_month_label', y='count', color='skyblue')
plt.xticks(rotation=45)
plt.title(plot_title)
plt.xlabel('Period')
plt.ylabel('Mention Count')
plt.tight_layout()
plt.show()

2. Ticker Distribution Analysis

Understand which tickers are mentioned most frequently:
def analyze_ticker_distribution(df_analysis, min_count=10):
    """
    Analyze the distribution of ticker mentions.
    
    Args:
        df_analysis (pd.DataFrame): Your normalized dataset
        min_count (int): Minimum mentions to include in analysis
    """
    # Count ticker frequencies
    ticker_counts = df_analysis['ticker'].value_counts().reset_index()
    ticker_counts.columns = ['ticker', 'count']
    
    # Filter out tickers with low counts
    ticker_counts = ticker_counts[ticker_counts['count'] >= min_count]
    
    # Summary statistics
    most_common = ticker_counts.iloc[0]
    least_common = ticker_counts.iloc[-1]
    unique_tickers = ticker_counts['ticker'].nunique()
    
    print(f"📈 Ticker Distribution (≥{min_count} mentions)")
    print(f"Unique tickers: {unique_tickers:,}")
    print(f"Most mentioned: {most_common['ticker']} ({most_common['count']:,} mentions)")
    print(f"Least mentioned: {least_common['ticker']} ({least_common['count']:,} mentions)")
    
    # Plot distribution
    plt.figure(figsize=(14, 6))
    sns.barplot(data=ticker_counts, x='ticker', y='count', color='skyblue')
    
    # Remove x-axis labels for clarity
    plt.xticks([])
    plt.xlabel('Tickers')
    plt.ylabel('Mention Count (Log Scale)')
    plt.title(f'Ticker Mention Distribution (Unique: {unique_tickers}, ≥{min_count} mentions)\n'
              f'Most: {most_common["ticker"]} ({most_common["count"]:,}) | '
              f'Least: {least_common["ticker"]} ({least_common["count"]:,})')
    plt.yscale('log')
    plt.tight_layout()
    plt.show()
    
    return ticker_counts

# Run the analysis
ticker_distribution = analyze_ticker_distribution(df_analysis, min_count=10)

3. Top Channels Analysis

Identify the most active channels in your dataset:
def analyze_top_channels(df_analysis, n_channels=10):
    """
    Analyze the top channels by ticker mention count.
    
    Args:
        df_analysis (pd.DataFrame): Your normalized dataset
        n_channels (int): Number of top channels to display
    """
    # Get top channels by ticker count
    top_channels = df_analysis['channel'].value_counts().head(n_channels).index
    df_top_channels = df_analysis[df_analysis['channel'].isin(top_channels)]
    
    # Calculate channel statistics
    channel_counts = df_top_channels['channel'].value_counts().reset_index()
    channel_counts.columns = ['channel', 'count']
    
    # Find most common ticker per channel
    most_common_ticker = (
        df_top_channels.groupby('channel')['ticker']
        .agg(lambda x: x.value_counts().idxmax())
        .reset_index()
    )
    most_common_ticker.columns = ['channel', 'top_ticker']
    channel_counts = channel_counts.merge(most_common_ticker, on='channel')
    
    print(f"📺 Top {n_channels} Channels by Ticker Mentions")
    for _, row in channel_counts.head(5).iterrows():
        print(f"  {row['channel']}: {row['count']:,} mentions (top ticker: {row['top_ticker']})")
    
    # Plot channel analysis
    plt.figure(figsize=(14, 8))
    ax = sns.barplot(data=channel_counts, y='channel', x='count', color='skyblue')
    
    # Add ticker labels to bars
    for i, (count, ticker) in enumerate(zip(channel_counts['count'], channel_counts['top_ticker'])):
        ax.text(count + max(channel_counts['count'])*0.01, i, ticker, va='center', fontsize=10)
    
    plt.title(f'Top {n_channels} Channels by Ticker Mentions\n(Labeled with Most Common Ticker)')
    plt.xlabel('Mention Count')
    plt.ylabel('Channel')
    plt.tight_layout()
    plt.show()
    
    return channel_counts

# Run the analysis
top_channels = analyze_top_channels(df_analysis, n_channels=8)
Analyze trends over time for top tickers:
def analyze_temporal_trends(df_analysis, start_year=2017, end_year=2024, top_n=5):
    """
    Analyze temporal trends for top tickers.
    
    Args:
        df_analysis (pd.DataFrame): Your normalized dataset
        start_year (int): Start year for analysis
        end_year (int): End year for analysis
        top_n (int): Number of top tickers to track
    """
    # Prepare data
    df_temporal = df_analysis.copy()
    df_temporal['date'] = pd.to_datetime(df_temporal['date'])
    df_temporal['year'] = df_temporal['date'].dt.year
    
    # Filter by year range
    df_temporal = df_temporal[
        (df_temporal['year'] >= start_year) & 
        (df_temporal['year'] <= end_year)
    ]
    
    # Get top tickers per year
    top_per_year = (
        df_temporal.groupby(['year', 'ticker']).size()
        .reset_index(name='count')
        .sort_values(['year', 'count'], ascending=[True, False])
    )
    top_tickers_by_year = top_per_year.groupby('year').head(top_n)
    
    # Plot trends
    plt.figure(figsize=(12, 6))
    for ticker, group in top_tickers_by_year.groupby('ticker'):
        group = group.sort_values('year')
        plt.plot(group['year'], group['count'], marker='o', label=ticker, linewidth=2)
    
    plt.legend(title='Tickers', bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.title(f'Top {top_n} Tickers per Year ({start_year}-{end_year}) by Mention Count')
    plt.xlabel('Year')
    plt.ylabel('Mention Count')
    plt.xticks(sorted(df_temporal['year'].unique()))
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.show()
    
    return top_tickers_by_year

# Run temporal analysis
temporal_trends = analyze_temporal_trends(df_analysis, start_year=2018, end_year=2024)

Sentiment Analysis

If your dataset includes sentiment data, analyze sentiment patterns:
def analyze_sentiment_patterns(df_analysis):
    """
    Analyze sentiment patterns in the dataset.
    
    Args:
        df_analysis (pd.DataFrame): Your normalized dataset with sentiment columns
    """
    if 'entity_sentiment_overt_buy_sell' not in df_analysis.columns:
        print("⚠️  Sentiment columns not found in dataset")
        return
    
    # Create sentiment matrix
    sentiment_matrix = df_analysis.groupby([
        'entity_sentiment_overt_buy_sell', 
        'entity_sentiment_generic'
    ]).size().unstack(fill_value=0)
    
    # Plot sentiment heatmap
    plt.figure(figsize=(10, 6))
    sns.heatmap(
        sentiment_matrix, 
        annot=True, 
        fmt='d', 
        cmap='Blues', 
        cbar_kws={'label': 'Count'}
    )
    plt.title('Sentiment Analysis Matrix')
    plt.xlabel('Generic Sentiment')
    plt.ylabel('Buy/Sell Sentiment')
    plt.tight_layout()
    plt.show()
    
    # Print summary
    total_sentiment_records = sentiment_matrix.sum().sum()
    print(f"📊 Sentiment Summary")
    print(f"Total records with sentiment: {total_sentiment_records:,}")
    print(f"Sentiment coverage: {(total_sentiment_records/len(df_analysis))*100:.1f}%")
    
    return sentiment_matrix

# Run sentiment analysis (if sentiment columns exist)
try:
    sentiment_analysis = analyze_sentiment_patterns(df_analysis)
except Exception as e:
    print(f"Sentiment analysis skipped: {e}")

Advanced Analysis Functions

5. Polarizing Tickers Analysis

Identify the most controversial or polarizing tickers based on sentiment patterns:
def analyze_polarizing_tickers(df_analysis, min_mentions=50):
    """
    Analyze which tickers generate the most polarizing sentiment.
    
    Args:
        df_analysis (pd.DataFrame): Your normalized dataset with sentiment columns
        min_mentions (int): Minimum mentions required to include ticker in analysis
        
    Returns:
        pd.DataFrame: DataFrame with polarization metrics
    """
    if 'entity_sentiment_overt_buy_sell' not in df_analysis.columns:
        print("⚠️  Sentiment columns not found in dataset")
        return pd.DataFrame()
    
    sentiment_df = df_analysis.copy()
    
    # Remove rows where ticker is 'NOT_FOUND' or missing
    sentiment_df = sentiment_df[
        (sentiment_df['ticker'] != 'NOT_FOUND') & 
        (sentiment_df['ticker'].notna())
    ]
    
    # Create binary sentiment indicators
    sentiment_df['overt_positive'] = sentiment_df['entity_sentiment_overt_buy_sell'].apply(
        lambda x: 1 if x == 'POSITIVE' else 0
    )
    sentiment_df['overt_negative'] = sentiment_df['entity_sentiment_overt_buy_sell'].apply(
        lambda x: 1 if x == 'NEGATIVE' else 0
    )
    sentiment_df['generic_positive'] = sentiment_df['entity_sentiment_generic'].apply(
        lambda x: 1 if x == 'POSITIVE' else 0
    )
    sentiment_df['generic_negative'] = sentiment_df['entity_sentiment_generic'].apply(
        lambda x: 1 if x == 'NEGATIVE' else 0
    )

    # Aggregate by ticker
    sentiment_agg = sentiment_df.groupby('ticker').agg({
        'overt_positive': 'sum',
        'overt_negative': 'sum',
        'generic_positive': 'sum',
        'generic_negative': 'sum',
        'entity_symbol': 'count'
    }).reset_index()
    
    # Calculate polarization ratios
    sentiment_agg['overt_ratio'] = sentiment_agg[['overt_positive', 'overt_negative']].min(axis=1) / sentiment_agg['entity_symbol']
    sentiment_agg['generic_ratio'] = sentiment_agg[['generic_positive', 'generic_negative']].min(axis=1) / sentiment_agg['entity_symbol']
    
    # Rename mentions column
    sentiment_agg.rename(columns={'entity_symbol': 'mentions_count'}, inplace=True)
    
    # Filter by minimum mentions and sort by polarization
    sentiment_agg = sentiment_agg[sentiment_agg['mentions_count'] >= min_mentions]
    sentiment_agg = sentiment_agg.sort_values(by=['overt_ratio', 'generic_ratio'], ascending=[False, False])
    
    # Get top 10 most polarizing tickers
    top_controversial = sentiment_agg.head(10)
    
    print(f"📊 Most Polarizing Tickers (≥{min_mentions} mentions)")
    for _, row in top_controversial.head(5).iterrows():
        pos_pct = (row['overt_positive'] / row['mentions_count']) * 100
        neg_pct = (row['overt_negative'] / row['mentions_count']) * 100
        print(f"  {row['ticker']}: {pos_pct:.1f}% positive, {neg_pct:.1f}% negative ({row['mentions_count']} total)")
    
    # Plot polarizing tickers
    plt.figure(figsize=(14, 8))
    plt.bar(top_controversial['ticker'], top_controversial['overt_positive'], 
           color='blue', label='Overt Positive')
    plt.bar(top_controversial['ticker'], -top_controversial['overt_negative'], 
           color='red', label='Overt Negative')
    
    plt.title('Most Polarizing Tickers by Sentiment Distribution')
    plt.xlabel('Ticker')
    plt.ylabel('Sentiment Count')
    plt.xticks(rotation=45)
    plt.legend()
    plt.tight_layout()
    plt.show()
    
    return sentiment_agg

# Run polarizing tickers analysis
try:
    polarizing_analysis = analyze_polarizing_tickers(df_analysis, min_mentions=50)
except Exception as e:
    print(f"Polarizing tickers analysis skipped: {e}")

6. Speaker Analysis

Analyze speakers:
# Speaker Analysis
df_speakers = df_visuals_channels.copy()
speakers_and_entity_counts = df_speakers.groupby(['speaker_name', 'speaker_associated_entity', 'speaker_role_context']).size().reset_index(name='count')

# Top hosts (where speaker_role_context == 'host')
top_hosts = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'host']
top_hosts = top_hosts.sort_values(by='count', ascending=False).head(10)

# Top guests (where speaker_role_context == 'guest')
top_guests = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'guest']
top_guests = top_guests.sort_values(by='count', ascending=False).head(10)

# Top quoted clips (where speaker_role_context == 'quoted clip')
top_quoted_clips = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'quoted clip']
top_quoted_clips = top_quoted_clips.sort_values(by='count', ascending=False).head(10)

# Graph each of these top speaker categories
plt.figure(figsize=(14, 8))

# Plot the top hosts
plt.subplot(3, 1, 1)
plt.barh(top_hosts['speaker_name'], top_hosts['count'], color='skyblue')
plt.title('Top 10 Hosts by Count')
plt.xlabel('Count')
plt.ylabel('Host Name')
plt.xticks(rotation=45)
plt.tight_layout()

plt.subplot(3, 1, 2)
# Plot the top guests
plt.barh(top_guests['speaker_name'], top_guests['count'], color='skyblue')
plt.title('Top 10 Guests by Count')
plt.xlabel('Count')
plt.ylabel('Guest Name')
plt.xticks(rotation=45)
plt.tight_layout()

plt.subplot(3, 1, 3)
# Plot the top quoted clips
plt.barh(top_quoted_clips['speaker_name'], top_quoted_clips['count'], color='skyblue')
plt.title('Top 10 Quoted Clips by Count')
plt.xlabel('Count')
plt.ylabel('Quoted Clip Name')
plt.xticks(rotation=45)
plt.tight_layout()

plt.show()

# Show top 20 guests with their associated entity names
top_guests = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'guest']

# Add the entity name to the guest name in a separate column
top_guests['guest_with_entity'] = top_guests['speaker_name'] + ' (' + top_guests['speaker_associated_entity'].str.slice(0, 25) + ')'

# Sort by count and take the top 20
top_guests = top_guests.sort_values(by='count', ascending=False).head(20)

# Plot the top 20 guests with their associated entity names
plt.figure(figsize=(14, 8))
plt.barh(top_guests['guest_with_entity'], top_guests['count'], color='skyblue')
plt.title('Top 20 Guests by Count with Associated Entity Names')
plt.xlabel('Count')
plt.ylabel('Guest Name (Associated Entity)')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Unique counts of guests and hosts
vid_count = df_visuals_channels['video_id'].nunique()

# Get the unique counts of guests and hosts
unique_guests = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'guest']['speaker_name'].nunique()
unique_hosts = speakers_and_entity_counts[speakers_and_entity_counts['speaker_role_context'] == 'host']['speaker_name'].nunique()

unique_hosts_ratio = unique_hosts / (vid_count)
unique_guests_ratio = unique_guests / (vid_count)

print(f"Unique Hosts:  {unique_hosts} ({unique_hosts_ratio:.1%})")
print(f"Unique Guests: {unique_guests} ({unique_guests_ratio:.1%})")

7. Ticker-Channel Relationships

Find which channels mentioned specific tickers most:
# For a given ticker, which channel mentioned it the most 
ticker_queries = [
    "CMCSA",
    "ULTA", 
    "TSCO"
]

df_plt6 = df_visuals_channels.copy()
df_plt6 = df_plt6[(df_plt6['ticker'] != 'NOT_FOUND') & (df_plt6['ticker'].notna())]

for ticker_query in ticker_queries:
    # Get the channel that mentioned the ticker the most
    df_ticker = df_plt6[df_plt6['ticker'] == ticker_query]

    top_channel = df_ticker['channel'].value_counts().idxmax()
    mention_count = df_ticker['channel'].value_counts().max()

    print(f"Channel that mentioned {ticker_query} the most: {top_channel} ({mention_count} times)")

Year-Based Analysis

For a given year, list the top tickers and channels:
# Customizable variables
target_year = 2022
top_n = 5

df_plt5 = df_visuals_channels.copy()

# Filter the data to include only data from target_year
df_plt5['date'] = pd.to_datetime(df_plt5['date'])
df_plt5['year'] = df_plt5['date'].dt.year
df_year = df_plt5[df_plt5['year'] == target_year]

# Top N tickers
top_tickers = df_year['ticker'].value_counts().head(top_n).index
df_top_tickers = df_year[df_year['ticker'].isin(top_tickers)]

# Get top_n channels mentioning these
top_channels = df_top_tickers['channel'].value_counts().head(10)

print("Top Tickers:")
print(df_year['ticker'].value_counts().head(top_n))

print("\nTop 10 Channels mentioning those tickers:")
print(top_channels)

Next Steps

With your summary statistics generated, you can:
  1. YouTube Core Dataset Overview - Learn about entity-focused analysis features
  2. YouTube Extended Dataset Overview - Discover complete transcript analysis capabilities
  3. Data Dictionaries - Complete field definitions for both datasets
Save your analysis code in a Jupyter notebook for easy reuse and sharing with your team.
I