Social Sensing Using NSF ACCESS High Performance Computing
Authors: Diya Li (Graduate Student), Zhenlei Song (Graduate Research Assistant), Zhe Zhang (Assistant Professor)
Cyberinfrastructure and Spatial Decision Intelligence Research Group
https://www.cidigis.com
Department of Geography
Texas A&M University
This training session is designed to introduce you to the fundamentals of social media mining and spatial analysis using NSF ACCESS High-Performance Research Computing (HPRC). We will explore various tools and techniques to extract, process, and analyze social media data.
Learning Objectives
•Learn essential techniques for social sensing, including the Twitter API, web scraping, emotion analysis, and demographic analysis of social media data.
•Learn how to use the NSF ACCESS computing environment to address data-intensive issues involved in social sensing.
•Learn and identify geoethical issues related to social sensing.
By the end of this session, students should be able to understand the principles of social sensing, effectively use HPRC for social sensing tasks, and apply their knowledge to real-world problems.
Step 1: Import Libraries¶
# Importing necessary libraries for data manipulation and visualization
import pandas as pd # Pandas for data manipulation and analysis
import numpy as np # NumPy for numerical computing with arrays
import matplotlib.pyplot as plt # Matplotlib's pyplot for creating static, animated, and interactive visualizations
import seaborn as sns # Seaborn for statistical data visualization based on Matplotlib
# Handling warnings in Python
import warnings # Import warnings module to manage warning messages
# Adding more specific comments about warnings
# This line will suppress all warnings of type UserWarning
# UserWarning is often raised in situations where the issue can be safely ignored
# This can help in making the notebook output cleaner and more readable
warnings.filterwarnings('ignore', category=UserWarning)
!python --version
Python 3.9.18
Step 2: Load Datasets¶
Dataset Overview¶
Our dataset consists of two types of files from social media data (primarily tweets) in 2020, organized by month.
1. Sentiment-Labelled CSV Files (`-emo.csv`)¶
These files (e.g., `2020-02-concated_df-emo.csv`) contain tweets that have been processed with sentiment labels, identifying the emotional tone of each tweet (positive, negative, neutral, or specific emotions).
Example: `2020-02-concated_df-emo.csv` - Sentiment-labelled tweets from February 2020.
2. Geotagged Data in GeoJSON Files¶
GeoJSON files (e.g., `2020-03-concated_df.geojson`) include tweets with geotagging information, providing the geographical coordinates for each tweet. This format is useful for spatial analysis and mapping trends.
Example: `2020-03-concated_df.geojson` - Geotagged tweets from March 2020.
These files offer insights into the sentiments expressed in tweets and their geographical distribution, allowing for a multifaceted analysis of social media trends throughout 2020.
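As a quick orientation, the sketch below loads one file of each type. It assumes pandas and geopandas are installed (both are used later in this notebook); the file names come from the examples above, and the variable names (emo_df, geo_df) are only illustrative.
# A minimal sketch of loading one file of each type.
# Adjust the paths to match your own copy of the dataset.
import pandas as pd
import geopandas as gpd

# Sentiment-labelled tweets for February 2020
emo_df = pd.read_csv('dataset/2020-02-concated_df-emo.csv')

# Geotagged tweets for March 2020
geo_df = gpd.read_file('dataset/2020-03-concated_df.geojson')

print(emo_df.shape)            # number of rows and columns in the labelled CSV
print(geo_df.geometry.head())  # point geometries parsed from the GeoJSON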
# Importing the os module to interact with the operating system
import os
# Setting the directory where the raw dataset is stored
dataset_directory = 'dataset'
# Initializing lists to hold the file paths of CSV and GeoJSON files
csv_files = [] # List for storing paths of CSV files
geojson_files = [] # List for storing paths of GeoJSON files
# Walking through each directory and subdirectory in the dataset_directory
for root, dirs, files in os.walk(dataset_directory):
    # Looping through each file in the current directory
    for file in files:
        # If the file ends with .csv, add it to the csv_files list
        if file.endswith('.csv'):
            csv_files.append(os.path.join(root, file))
        # Similarly, if the file ends with .geojson, add it to geojson_files list
        if file.endswith('.geojson'):
            geojson_files.append(os.path.join(root, file))
# Printing the paths of all CSV files found in the dataset directory
for csv_file in csv_files:
    print(csv_file)
# Printing the paths of all GeoJSON files found in the dataset directory
for geojson_file in geojson_files:
    print(geojson_file)
dataset/2020-02-concated_df-emo.csv
dataset/2020-03-concated_df.csv
dataset/2020-05-concated_df.csv
dataset/2020-01-concated_df.csv
dataset/2020-04-concated_df-emo.csv
dataset/2020-03-concated_df-emo.csv
dataset/2020-02-concated_df.csv
dataset/2020-05-concated_df-emo.csv
dataset/2020-01-concated_df-emo.csv
dataset/2020-04-concated_df.csv
dataset/2020-03-concated_df.geojson
dataset/2020-05-concated_df.geojson
dataset/2020-04-concated_df.geojson
dataset/2020-01-concated_df.geojson
dataset/2020-02-concated_df.geojson
# Load the CSV file
df = pd.read_csv('dataset/2020-01-concated_df.csv')
# Display the first few rows of the dataframe to understand its structure
df.head()
 | created_at | id | full_text | cleaned_text | entities | retweet_count | favorite_count | CountyId | user_name | user_followers_count | user_friends_count | user_listed_count | favourites_count | user_location | geo |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 2020-01-25 21:37:21+00:00 | 1221185310316322816 | Getting ready. Many stores are sold out of med... | getting ready many stores sold medical facemas... | {'hashtags': [{'text': 'FaceMasks', 'indices':... | 0 | 2 | 6085.0 | William Bender, FCSI | 6639 | 6466 | 361 | 16473 | San Jose, California | [-121.8865, 37.3376] |
1 | 2020-01-31 02:52:08+00:00 | 1223076466801250304 | Thoughts? San Francisco :: As coronavirus fear... | thoughts san francisco fears mount fewer fligh... | {'hashtags': [{'text': 'GraceShi', 'indices': ... | 0 | 0 | 6075.0 | Shaun Haines 力是亮 | 1 | 4970 | 0 | 7468 | San Francisco, CA | [-122.4190191, 37.7485909] |
2 | 2020-01-31 02:19:03+00:00 | 1223068144933097472 | San Francisco :: San Francisco :: Trump under ... | san francisco san francisco trump growing pres... | {'hashtags': [], 'symbols': [], 'user_mentions... | 0 | 0 | 6081.0 | Shaun Haines 力是亮 | 1 | 4970 | 0 | 7468 | San Francisco, CA | [-122.38733002, 37.73465621] |
3 | 2020-01-25 00:21:02+00:00 | 1220864116912345088 | Thoughts? San Francisco :: Life science compan... | thoughts san francisco life science companies ... | {'hashtags': [], 'symbols': [], 'user_mentions... | 0 | 0 | 6075.0 | Shaun Haines 力是亮 | 1 | 4970 | 0 | 7468 | San Francisco, CA | [-122.3958528, 37.7929728] |
4 | 2020-01-25 00:38:17+00:00 | 1220868458314944512 | She was only 16. We’re heartbroken, we’re furi... | ’ heabroken ’ furious ’ stop thinking loved on... | {'hashtags': [{'text': 'MMIW', 'indices': [143... | 9 | 26 | NaN | B . YE L L O W T A I L | 6233 | 464 | 75 | 3326 | NaN | [-107.613, 45.7318] |
# Display the columns of the dataframe
print(df.columns)
# The output below lists the dataset's attribute (column) names
Index(['created_at', 'id', 'full_text', 'cleaned_text', 'entities', 'retweet_count', 'favorite_count', 'CountyId', 'user_name', 'user_followers_count', 'user_friends_count', 'user_listed_count', 'favourites_count', 'user_location', 'geo'], dtype='object')
Dataset Columns Explanation¶
The dataset consists of the following columns, each representing a specific aspect of social media data, primarily from a platform like Twitter. Here's a brief overview of each column:
created_at: The date and time when the post was created. This information is crucial for temporal analysis and understanding the timing of posts.
id: A unique identifier for each post. This is essential for distinguishing between different posts.
full_text: The original text of the post. This column is vital for any text analysis, including sentiment analysis and topic modeling.
cleaned_text: A version of the post text that has been processed to remove noise. Cleaning may include removing special characters, correcting typos, or eliminating irrelevant sections.
entities: This could include various entities identified in the post, like hashtags, user mentions, URLs, etc. It's useful for extracting specific elements from the text.
retweet_count: The number of times the post has been retweeted. This can indicate the post's popularity or reach.
favorite_count: The number of times the post has been marked as a favorite (or liked). Another indicator of the post's popularity.
CountyId: An identifier for the geographic location, potentially linking the post to a specific county. This is key for spatial analysis.
user_name: The name of the user who posted. Useful for attributing the post and potentially for analyzing user-specific patterns.
user_followers_count: The number of followers the user has. This gives an idea of the user's influence or reach.
user_friends_count: The number of users the poster is following. Can be indicative of the user's network size.
user_listed_count: The number of lists the user is part of. This metric can also be a sign of influence or popularity.
favourites_count: The total number of posts the user has liked. This might reflect the user's interests or engagement level.
user_location: The self-reported location of the user. Important for geographical analysis and understanding the geographical distribution of users.
geo: Geographical coordinates associated with the post, if available. Crucial for detailed spatial analysis and mapping.
Each of these columns can provide valuable insights, and together they are critical for comprehensive social media analysis, especially when combined with techniques like machine learning, natural language processing, and spatial analysis.
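Several of these columns need light preparation before analysis. The sketch below shows one possible approach, assuming (based on the df.head() preview above) that `entities` and `geo` are stored in the CSV as stringified Python literals; the new column names (entities_parsed, geo_parsed, hashtags) are only illustrative.
# A minimal sketch of converting raw columns into analysis-ready types.
# Assumption: 'entities' and 'geo' are stringified Python literals in the CSV.
import ast
import pandas as pd

df['created_at'] = pd.to_datetime(df['created_at'])  # datetimes for temporal analysis

def parse_literal(value):
    # Safely turn a stringified dict/list back into a Python object
    try:
        return ast.literal_eval(value) if isinstance(value, str) else None
    except (ValueError, SyntaxError):
        return None

df['entities_parsed'] = df['entities'].apply(parse_literal)
df['geo_parsed'] = df['geo'].apply(parse_literal)

# Example: extract hashtag text from the parsed entities
df['hashtags'] = df['entities_parsed'].apply(
    lambda e: [h['text'] for h in e.get('hashtags', [])] if isinstance(e, dict) else [])

print(df[['created_at', 'hashtags', 'geo_parsed']].head())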
Step 3: Handle Missing Data¶
# Analyzing missing data in the DataFrame
# Calculate the number of missing values for each column in the DataFrame
# df.isnull() creates a boolean mask where True indicates missing values
# .sum() then sums up these True values column-wise, giving the total count of missing values per column
missing_values = df.isnull().sum()
# Plotting the missing values
# Setting up the figure size for the plot for better visibility and aesthetics
plt.figure(figsize=(10, 6))
# Creating a bar plot using seaborn
# x=missing_values.index provides the column names for the x-axis
# y=missing_values.values provides the corresponding counts of missing values for the y-axis
sns.barplot(x=missing_values.index, y=missing_values.values)
# Rotating the x-axis labels by 90 degrees to make them readable as there can be many columns
plt.xticks(rotation=90)
# Setting the x-axis label to 'Columns'
plt.xlabel('Columns')
# Setting the y-axis label to 'Number of Missing Values'
plt.ylabel('Number of Missing Values')
# Setting the title of the plot for better understanding of what the plot represents
plt.title('Missing Values in Each Column')
# Displaying the plot
plt.show()
Step 4: Analyze and Visualize the Data (e.g., Top 40 Users)¶
# Group by user and count the number of posts
user_post_counts = df.groupby('user_name').size().sort_values(ascending=False)
# Select the top 40 users
top_40_users = user_post_counts.head(40)
# Plotting
plt.figure(figsize=(12, 8))
sns.barplot(x=top_40_users.values, y=top_40_users.index)
plt.xlabel('Number of Posts')
plt.ylabel('User Names')
plt.title('Top 40 Users by Number of Posts')
plt.show()
# Identify the most active user
most_active_user = user_post_counts.idxmax()
# Optional: Filter out the most active user from the dataset
# df = df[df['user_name'] != most_active_user]
Take away: Why identify the most active users? ¶
Avoiding Data Skew: A single, highly active user can skew the analysis, especially in sentiment analysis or trend detection.
Representative Sampling: Ensuring that the dataset is representative of the broader population, not dominated by a few voices.
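One way to act on these points in code is sketched below. The variable names (df_without_top_user, df_capped) and the cap of 50 posts per user are illustrative choices, not recommendations; pick a threshold that suits your analysis.
# A minimal sketch of two ways to limit the influence of hyperactive accounts.

# Option 1: drop the single most active user identified above
df_without_top_user = df[df['user_name'] != most_active_user]

# Option 2: keep at most MAX_POSTS_PER_USER posts from each account
MAX_POSTS_PER_USER = 50  # illustrative threshold
df_capped = (df.sort_values('created_at')
               .groupby('user_name', group_keys=False)
               .head(MAX_POSTS_PER_USER))

print(len(df), len(df_without_top_user), len(df_capped))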
Step 1: Import Libraries and Load DataFrames¶
import pandas as pd
# List of file names
file_names = [
'dataset/2020-01-concated_df.csv',
'dataset/2020-02-concated_df.csv',
'dataset/2020-03-concated_df.csv',
'dataset/2020-04-concated_df.csv',
'dataset/2020-05-concated_df.csv'
]
# Load and concatenate the DataFrames
df_list = [pd.read_csv(file) for file in file_names]
large_df = pd.concat(df_list)
Social media data is generated in real time, with millions of new posts appearing continuously. It is diverse and dynamic, ranging from short text messages to high-resolution images. This diversity contributes to the large volume of data generated.
Social media data sets are often too large to be processed efficiently using traditional computing resources. High-performance computing systems provide the necessary scalability to handle massive volumes of data by distributing computations across multiple processors or nodes.
The `transformers` library can perform sentiment analysis effectively because of its underlying model architecture and training. The model used here, `bertweet-base-sentiment-analysis`, is a variant of BERT (Bidirectional Encoder Representations from Transformers) fine-tuned for sentiment analysis on Twitter data. Here's a breakdown of why this model is effective:
BERT Model: BERT models are pre-trained on a large corpus of text and then fine-tuned for specific tasks like sentiment analysis. They are designed to understand the context of a word in a sentence, which is crucial for accurately determining sentiment[2].
Fine-tuning on Twitter Data: The `bertweet-base-sentiment-analysis` model is specifically fine-tuned on Twitter data, making it adept at handling the nuances, slang, and abbreviations commonly found in tweets [1].
Bidirectional Context: BERT models are bidirectional, meaning they consider the context from both the left and right sides of a token in the sentence. This helps in better understanding the sentiment expressed in a tweet.
Transfer Learning: The model leverages transfer learning, where knowledge gained while learning one task is applied to a different but related task (in this case, sentiment analysis).
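Before running the classifier over the whole DataFrame, it helps to inspect the pipeline's output on a single example. The sketch below loads the same finiteautomata/bertweet-base-sentiment-analysis model used in this module; the example text is invented for illustration.
# A minimal sketch: classify one made-up example tweet to see the output format,
# a list with one dict per input containing 'label' and 'score'.
from transformers import pipeline

pipe = pipeline("text-classification",
                model="finiteautomata/bertweet-base-sentiment-analysis")

example = "so thankful for the nurses and doctors working around the clock"
result = pipe([example], truncation=True, padding=True)
print(result)  # e.g. [{'label': 'POS', 'score': ...}]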
# Function to apply sentiment analysis to a single text
def apply_sentiment_to_text(text):
    try:
        result = pipe([text], truncation=True, padding=True)
        return result[0]['label']
    except Exception as e:
        return None
# Apply sentiment analysis to each text individually
sentiments = []
for text in df['cleaned_text']:
    sentiment = apply_sentiment_to_text(text)
    sentiments.append(sentiment)
df['sentiment'] = sentiments
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from multiprocessing import Pool, cpu_count
# Load the model and tokenizer
tokenizer = AutoTokenizer.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")
model = AutoModelForSequenceClassification.from_pretrained("finiteautomata/bertweet-base-sentiment-analysis")
pipe = pipeline("text-classification", model=model, tokenizer=tokenizer)
def apply_sentiment(text):
    try:
        result = pipe([text], truncation=True, padding=True)
        return result[0]['label']
    except Exception as e:
        return None
# Function to process each chunk of the DataFrame
def process_chunk(chunk):
    chunk['sentiment'] = chunk['cleaned_text'].apply(apply_sentiment)
    return chunk
# Main processing function
def parallelize_dataframe_processing(df, func, n_cores):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df
# Parallelize sentiment analysis
n_cores = cpu_count() # Or set this to the number of cores you want to use
print(n_cores, "CPU cores available.")
start_time = time.time()
df = parallelize_dataframe_processing(large_df, process_chunk, 2) # increase the core count to speed up processing
end_time = time.time()
# Calculate and print the total time taken
total_time = end_time - start_time
print(f"Total time for sentiment analysis: {total_time} seconds")
emoji is not installed, thus not converting emoticons or emojis into text. Install emoji: pip3 install emoji==0.6.0
64 CPU cores available.
Total time for sentiment analysis: 429.2285463809967 seconds
Step 4: Visualize the Sentiment Analysis Results¶
To visualize the distribution of the sentiment analysis results, you can use libraries like Matplotlib or Seaborn to create a bar plot. Here's how you can do it:
This code will create a bar chart showing the number of tweets for each sentiment category (POS, NEG, NEU). The chart will give you a visual representation of how sentiments are distributed across your dataset.
Keep in mind that while `transformers` provides powerful tools for NLP tasks like sentiment analysis, the results depend on the quality of the data, the choice of model, and the fine-tuning. It's also crucial to interpret the results in the context of your specific use case and dataset.
import matplotlib.pyplot as plt
import seaborn as sns
# Assuming 'sentiment' is the column with sentiment analysis results
sentiment_counts = df['sentiment'].value_counts()
# Plotting
plt.figure(figsize=(8, 6))
sns.barplot(x=sentiment_counts.index, y=sentiment_counts.values)
plt.xlabel('Sentiment')
plt.ylabel('Number of Tweets')
plt.title('Distribution of Sentiment Analysis Results')
plt.show()
import pandas as pd
# Fill missing values with a placeholder (e.g., -1) and handle it in later analysis
df['CountyId'] = df['CountyId'].fillna(-1)
# Convert to integers
df['CountyId'] = df['CountyId'].astype(int)
# Convert to string
df['CountyId'] = df['CountyId'].astype(str)
# Check the conversion
print(df['CountyId'].head())
0    6085
1    6075
2    6081
3    6075
4      -1
Name: CountyId, dtype: object
Visualize the data on the map¶
Converting CountyId from Float to Integer String
To plot sentiment data on a map, we need to associate each tweet with a specific geographic location. In our dataset, this information is represented by the CountyId column. However, CountyId is currently stored as a float, which is not ideal for mapping purposes. We need to convert this column to a string representation of integer values.
Here's the process for this conversion:
Handle Missing Values: Check for missing values in the CountyId column and decide how to handle them. One approach is to drop these rows, but this might lose valuable data; another is to fill them with a placeholder value. The choice depends on the context of your analysis and how you plan to treat such cases when mapping.
Convert to Integers: Change the datatype from float to integer.
Convert to String: Finally, convert the integer values to strings. Converting CountyId to integer first removes any decimal points, and converting to string then makes it suitable for categorization or merging where string identifiers are needed. This conversion is crucial for accurately mapping the sentiment data to specific geographic locations, which can provide valuable spatial insights into the sentiment distribution.
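One formatting detail to keep in mind: US county FIPS codes are conventionally five digits with leading zeros (for example, 06085 for Santa Clara County, CA), while the integer-to-string conversion above yields '6085'. If the county boundary file used in the next step identifies counties with zero-padded codes, pad CountyId before merging. A minimal sketch, using a hypothetical copy df_padded so the original DataFrame is left untouched:
# A minimal sketch, assuming the boundary file uses zero-padded 5-digit FIPS codes.
df_padded = df.copy()
df_padded['CountyId'] = df_padded['CountyId'].fillna(-1).astype(int).astype(str)
# Pad genuine county codes to five digits ('6085' -> '06085'); keep the -1 placeholder as-is
df_padded['CountyId'] = df_padded['CountyId'].where(df_padded['CountyId'] == '-1',
                                                    df_padded['CountyId'].str.zfill(5))
print(df_padded['CountyId'].head())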
import geopandas as gpd
import matplotlib.pyplot as plt
# Load geospatial data
geo_data = gpd.read_file('geojson-counties-fips.json')
# Convert 'CountyId' in geo_data to string (if necessary)
geo_data['CountyId'] = geo_data['id'].astype(str)
# Define bounding box coordinates for the contiguous United States
minx, miny, maxx, maxy = -124.848974, 24.396308, -66.885444, 49.384358
# Filter the GeoDataFrame
geo_data = geo_data.cx[minx:maxx, miny:maxy]
# Merge your DataFrame with the geospatial data
merged_data = geo_data.merge(df, on='CountyId', how='left')
# Plotting
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
# Plot base layer of all counties
geo_data.plot(ax=ax, color='lightgrey')
# Overlay with sentiment data (choose a column for color-coding)
merged_data.plot(column='sentiment', ax=ax, legend=True, categorical=True, alpha=0.7)
# Add titles and labels as necessary
plt.title('Sentiment Analysis by County')
plt.show()
Exercise: Aggregating and Plotting Sentiment Data¶
In this exercise, you will aggregate multiple datasets containing Twitter sentiment data from January to May 2020. Your task is to combine these datasets into a single DataFrame and then visualize the aggregated sentiment data on a map of the contiguous United States.
Steps:¶
Aggregate Datasets: Load and concatenate the datasets from January to May 2020.
Prepare the Data: Ensure the CountyId column is in the correct format for later merging with geospatial data.
Visualize on a Map: Plot the combined data on a map, focusing on the contiguous United States.
Code Example:¶
First, load and concatenate the datasets:
import pandas as pd
# List of file names
file_names = [
'dataset/2020-01-concated_df.csv',
'dataset/2020-02-concated_df.csv',
'dataset/2020-03-concated_df.csv',
'dataset/2020-04-concated_df.csv',
'dataset/2020-05-concated_df.csv'
]
# Load and concatenate the DataFrames
df_list = [pd.read_csv(file) for file in file_names]
combined_df = pd.concat(df_list)
# Ensure 'CountyId' is a string (if you're planning to merge with GeoDataFrame later)
combined_df['CountyId'] = combined_df['CountyId'].astype(str)
After preparing the aggregated data, follow the steps provided in the previous lectures to:
- Load the geospatial data for the contiguous United States.
- Merge the sentiment data with the geospatial data.
- Plot the data using GeoPandas, focusing on the sentiment distribution across different counties.
import pandas as pd
# List of file names
file_names = [
'dataset/2020-01-concated_df.csv',
'dataset/2020-02-concated_df.csv',
'dataset/2020-03-concated_df.csv',
'dataset/2020-04-concated_df.csv',
'dataset/2020-05-concated_df.csv'
]
# Load and concatenate the DataFrames
df_list = [pd.read_csv(file) for file in file_names]
combined_df = pd.concat(df_list)
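From here, the remaining steps mirror the single-month example earlier in this module. The sketch below is one way to finish the exercise, not the reference solution; it assumes combined_df contains a sentiment column (for example, because you ran the sentiment pipeline on it, or loaded the -emo.csv files instead of the plain CSVs).
# A sketch of the remaining exercise steps, mirroring the single-month example above.
# Assumption: combined_df contains a 'sentiment' column (see the note above).
import geopandas as gpd
import matplotlib.pyplot as plt

# Format CountyId the same way as before
combined_df['CountyId'] = combined_df['CountyId'].fillna(-1).astype(int).astype(str)

# Load county boundaries and clip to the contiguous United States
geo_data = gpd.read_file('geojson-counties-fips.json')
geo_data['CountyId'] = geo_data['id'].astype(str)
geo_data = geo_data.cx[-124.848974:-66.885444, 24.396308:49.384358]

# Merge the aggregated tweets with the county geometries
merged_data = geo_data.merge(combined_df, on='CountyId', how='left')

# Plot the base counties and overlay the sentiment data
fig, ax = plt.subplots(1, 1, figsize=(15, 10))
geo_data.plot(ax=ax, color='lightgrey')
merged_data.plot(column='sentiment', ax=ax, legend=True, categorical=True, alpha=0.7)
plt.title('Aggregated Sentiment by County, January-May 2020')
plt.show()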
Module 4. Geoethics in Social Sensing¶
Step 1: Ethnicity Prediction¶
Step 2: Demographic Analysis¶
Introduction¶
Demographics, which include factors like age, gender, location, and socio-economic status, play a pivotal role in shaping opinions, trends, and interactions on social media platforms. Analyzing these aspects can provide deeper understanding of social media dynamics and audience characteristics. This section will teach students how to conduct ethnicity and demographic analysis using social media data. In the following sections, we will use tools designed to estimate the demographics of Twitter users based on their public information. This will involve:
- Utilizing specialized algorithms and tools to infer demographic attributes.
- Developing code that processes each username through the pipeline, extracts the relevant labels with the highest scores (while excluding certain labels), and assigns these results back to your DataFrame.
- Conducting a thorough analysis to understand the diverse Twitter audience.
- Interpreting the results in a way that is respectful, ethical, and mindful of privacy concerns.
When analyzing demographics, it's important to navigate the ethical landscape carefully. Key considerations include:
Privacy and Consent: Social media users often share information without expecting it to be used for demographic analysis. It is important to protect user privacy and obtain proper consent.
Data Sensitivity: Some demographic data, like ethnicity or political affiliation, can be sensitive. Handling such data requires an awareness of potential biases and misuse.
Anonymity and Aggregation: To maintain user privacy, we should anonymize data and present findings in aggregate form rather than focusing on individual users at specific locations (see the sketch after this list).
Avoiding Stereotypes: While demographic analysis can reveal trends, it's crucial to avoid reinforcing stereotypes or making broad generalizations about particular groups.
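To make the anonymity-and-aggregation point concrete, here is a minimal sketch using column names from this dataset. The variable names (anon_df, county_summary) and the suppression threshold of 10 are arbitrary illustrative choices.
# A minimal sketch of privacy-conscious reporting: drop direct identifiers,
# aggregate to the county level, and suppress very small groups.
MIN_GROUP_SIZE = 10  # arbitrary illustrative threshold

# Work on a copy with user-identifying columns removed
anon_df = combined_df.drop(columns=['user_name', 'user_location'], errors='ignore')

# Report county-level counts instead of individual posts
county_summary = (anon_df.groupby('CountyId')
                         .size()
                         .reset_index(name='post_count'))

# Suppress counties with too few posts to reduce re-identification risk
county_summary = county_summary[county_summary['post_count'] >= MIN_GROUP_SIZE]
print(county_summary.head())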
Step 1: Ethnicity Prediction¶
Instructions:
To analyze ethnicity and race by name using a text classification model from the `transformers` library, you'll use a pipeline with the model `tanoManzo/minilm-finetuned-mimic-race-ethnicity-multi-label`. This model is presumably fine-tuned to classify names by race and ethnicity. However, it's crucial to approach and interpret the results of such models with caution due to ethical and accuracy concerns.
To analyze ethnicity and race based on Twitter usernames using the specified pipeline, you'll need to write code that processes each username through the pipeline, extracts the relevant labels with the highest scores (while excluding certain labels), and then assigns these results back to your DataFrame.
# Initialize the pipeline
pipe = pipeline("text-classification", model="tanoManzo/minilm-finetuned-mimic-race-ethnicity-multi-label", return_all_scores=True)
# You need a function that processes the pipeline's output to extract the highest-scoring ethnicity and race labels, excluding the specified labels.
def get_ethnicity_race(username):
    # Filter and process if the username is a non-empty string
    if isinstance(username, str) and username.strip():
        try:
            results = pipe(username)
            # Extract ethnicity and race results
            ethnicity = max((r for r in results[0] if r['label'].startswith('ETH') and
                             'No Information' not in r['label'] and
                             'Not Covered' not in r['label']),
                            key=lambda x: x['score'],
                            default=None)
            race = max((r for r in results[0] if r['label'].startswith('RACE') and
                        'No Information' not in r['label'] and
                        'Not Covered' not in r['label']),
                       key=lambda x: x['score'],
                       default=None)
            return {
                'Ethnicity': ethnicity['label'] if ethnicity else None,
                'Race': race['label'] if race else None
            }
        except Exception as e:
            return {'Ethnicity': None, 'Race': None}
    else:
        return {'Ethnicity': None, 'Race': None}
# Assuming combined_df is your DataFrame and 'user_name' is the column with usernames
results = combined_df['user_name'].apply(get_ethnicity_race)
combined_df['Predicted_Ethnicity'] = results.apply(lambda x: x['Ethnicity'])
combined_df['Predicted_Race'] = results.apply(lambda x: x['Race'])
# Calculate percentage distribution for Ethnicity
ethnicity_counts = combined_df['Predicted_Ethnicity'].value_counts(normalize=True) * 100
# Calculate percentage distribution for Race
race_counts = combined_df['Predicted_Race'].value_counts(normalize=True) * 100
# Group by 'CountyId' and count the number of entries in each group
county_counts = combined_df.groupby('CountyId').size()
# Sort the counts in descending order and select the 1,000 most active counties
top_counties = county_counts.sort_values(ascending=False).head(1000)
# Get the CountyId values of these top counties
top_county_ids = top_counties.index
# Select rows from combined_df where CountyId is one of the top counties
# (.copy() avoids SettingWithCopyWarning when the columns are modified below)
selected_counties = combined_df[combined_df['CountyId'].isin(top_county_ids)].copy()
selected_counties['CountyId'] = selected_counties['CountyId'].fillna(-1)
selected_counties['CountyId'] = selected_counties['CountyId'].astype(int)
selected_counties['CountyId'] = selected_counties['CountyId'].astype(str)
selected_counties_with_name = selected_counties.merge(geo_data[['CountyId', 'NAME']], on='CountyId', how='left')
selected_counties_with_name.rename(columns={'NAME': 'county_name'}, inplace=True)
selected_counties_with_name = selected_counties_with_name.dropna(subset=['county_name'])
filtered_counties = selected_counties_with_name[
selected_counties_with_name['county_name'].isin(['Miami-Dade', 'Clark'])
]
import pandas as pd
# Group by predicted ethnicity and county and count occurrences
grouped_data = filtered_counties.groupby(['Predicted_Ethnicity', 'county_name']).size().reset_index(name='count')
# Filter for only Hispanic and Non-Hispanic groups
filtered_data = grouped_data[grouped_data['Predicted_Ethnicity'].isin([
    'ETH Hispanic/Latino/Latina/Latinx',
    'ETH Non-Hispanic/Non-Latino/Non-Latina/Non-Latinx'])].copy()
# Calculate the total count for each county
total_count_by_county = filtered_data.groupby('county_name')['count'].sum()
# Calculate the percentage for each row
filtered_data['percentage'] = filtered_data.apply(lambda row: (row['count'] / total_count_by_county[row['county_name']]) * 100, axis=1)
# Set the figure size and create the bar plot
plt.figure(figsize=(10, 6))
barplot = sns.barplot(x='county_name', y='percentage', hue='Predicted_Ethnicity', data=filtered_data)
# Set the title and labels for the plot
plt.title('Ethnicity Distribution among Counties (Percentage)')
plt.xlabel('County Name')
plt.ylabel('Percentage')
plt.legend(title='Ethnicity')
# Add percentage annotations on each bar
for p in barplot.patches:
    height = p.get_height()
    # Only add annotations if the height is not NaN and greater than 0
    if not pd.isna(height) and height > 0:
        barplot.annotate(f'{height:.1f}%',
                         (p.get_x() + p.get_width() / 2., height),
                         ha='center', va='center',
                         xytext=(0, 9),
                         textcoords='offset points')
plt.xticks(rotation=45) # Rotate the x-axis labels for better readability
plt.show()
Exercise: Identifying the Race of Social Media Users¶
Objective¶
In this exercise, you will perform a race analysis of social media users using the previously introduced algorithm. Also, consider whether we can combine race analysis results with other attributes. Will the results pose ethical issues?
Task Description¶
Your task involves the following steps:
Data Preparation: Utilize the previously established pipeline to predict the race of Twitter users based on their usernames. The pipeline should return categories like 'RACE Black or African American', 'RACE Asian', 'RACE White', etc.
Aggregate Data: Group the data by the predicted race.
Visualization: Create a bar chart that shows the distribution of race within two different counties.
Interpretation: Write a brief analysis of your findings.
Code Snippet¶
Here is a snippet to help you get started with the aggregation and plotting:
# Assuming 'Predicted_Race' and another attribute column (e.g., a sentiment column) are in your DataFrame
grouped_data = df.groupby(['Predicted_Race', 'other_attribute']).size().reset_index(name='count')
# Calculate total counts for each race group
total_counts = grouped_data.groupby('Predicted_Race')['count'].sum()
# Calculate percentages
grouped_data['percentage'] = grouped_data.apply(lambda x: (x['count'] / total_counts[x['Predicted_Race']]) * 100, axis=1)
# Plotting
# (Your code for the bar plot goes here, similar to the previous ethnicity exercise)
Reflection¶
- Ethical Considerations: Reflect on the ethical implications of analyzing race in social media. Discuss the potential biases in the data and the limitations of algorithmic predictions.
- Cultural Sensitivity: Ensure that your analysis and interpretation are done with cultural sensitivity and awareness of the complexity and diversity within and across racial groups.
This exercise will help you develop a deeper understanding of how social media sentiment varies across different racial demographics and enhance your skills in data analysis and visualization.
References:¶
- [1] Pérez, J. M., Giudici, J. C., & Luque, F. (2021). pysentimiento: A Python Toolkit for Sentiment Analysis and SocialNLP Tasks. arXiv. https://arxiv.org/abs/2106.09462
- [2] Devlin, J., Chang, M. W., Lee, K., & Toutanova, K. (2018). BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding. arXiv. https://arxiv.org/abs/1810.04805
- [3] Li, D., Chaudhary, H., & Zhang, Z. (2020). Modeling spatiotemporal pattern of depressive symptoms caused by COVID-19 using social media data mining. International Journal of Environmental Research and Public Health, 17(14), 4988.