from otree.api import * import pandas as pd import numpy as np import re import itertools import urllib.parse import os import json doc = """ Mimic social media feeds with DICE. """ class C(BaseConstants): NAME_IN_URL = 'DICE' PLAYERS_PER_GROUP = None NUM_ROUNDS = 1 CONSENT_TEMPLATE = "DICE/T_Consent.html" TOPICS_TEMPLATE = "DICE/T_Trending_Topics.html" ITEM_POST = "DICE/T_Item_Post.html" class Subsession(BaseSubsession): feed_conditions = models.StringField(doc='indicates the feed condition a player is randomly assigned to') class Group(BaseGroup): pass class Player(BasePlayer): feed_condition = models.StringField(doc='indicates the feed condition a player is randomly assigned to') feed_toxicity = models.FloatField(doc='mean toxicity score of the feed shown to this player', blank=True) sequence = models.StringField(doc='prints the sequence of posts based on doc_id') dwell_data = models.LongStringField(doc='tracks the time feed items were visible in a participants viewport.', blank=True) focal_line_data = models.LongStringField(doc='tracks cumulative time each post spent covering the focal line.', blank=True) rowheight_data = models.LongStringField(doc='tracks the height of feed items in pixels.', blank=True) likes_data = models.LongStringField(doc='tracks likes.', blank=True) replies_data = models.LongStringField(doc='tracks replies.', blank=True) lottery_signup = models.BooleanField(doc='indicates whether the participant entered the Disney+ lottery draw.', blank=True) time_on_feed = models.FloatField(doc='seconds spent browsing the feed, from preloader hide to submit click.', blank=True) creative_image = models.LongStringField(doc='JSON mapping of doc_id to creative media path for posts with assigned creatives.', blank=True) is_touch_device = models.BooleanField(doc="indicates whether a participant uses a touch device to access survey.", blank=True) device_type = models.StringField(doc="indicates the participant's device type based on screen width.", blank=True) screen_resolution = models.StringField(doc="indicates the participant's screen resolution, i.e., width x height.", blank=True) # FUNCTIONS ----- def creating_session(subsession): # Load and preprocess data once but shuffle and assign for each player df = read_feed(path=subsession.session.config['data_path'], delim=subsession.session.config['delimiter']) processed_posts = preprocessing(df, subsession.session.config) # Check if the file contains any conditions and assign groups to it condition = subsession.session.config['condition_col'] if condition in processed_posts.columns: feed_conditions = itertools.cycle(processed_posts[condition].unique()) subsession.feed_conditions = ', '.join(processed_posts[condition].unique()) players = subsession.get_players() feed_size = subsession.session.config.get('feed_size', 0) # Load creative images if available creatives_path = subsession.session.config.get('creatives_path', '') creatives = [] creatives_are_local = False if creatives_path: if os.path.isdir(creatives_path): image_exts = {'.jpg', '.jpeg', '.png', '.webp', '.gif'} folder = os.path.basename(creatives_path.rstrip('/\\')) creatives = [ f'{folder}/{f}' for f in sorted(os.listdir(creatives_path)) if os.path.splitext(f)[1].lower() in image_exts ] creatives_are_local = True else: try: creatives = pd.read_csv(creatives_path)['url'].dropna().tolist() except Exception: pass num_creatives = subsession.session.config.get('num_creatives', 5) # Prepare uniformly spaced toxicity targets if feed_size is configured toxicity_targets = None if feed_size > 0 and 'toxicity' in processed_posts.columns: n_players = len(players) tox_min = processed_posts['toxicity'].min() tox_max = processed_posts['toxicity'].max() toxicity_targets = np.linspace(tox_min, tox_max, n_players) np.random.shuffle(toxicity_targets) for i, player in enumerate(players): # Deep copy the DataFrame to ensure each player gets a unique shuffled version posts = processed_posts.copy() # Assign a condition to the player if conditions are present if condition in posts.columns: player.feed_condition = next(feed_conditions) condition_mask = posts[condition] == player.feed_condition # Always keep fixed-position posts regardless of their condition value if 'fixed_position' in posts.columns: condition_mask = condition_mask | posts['fixed_position'].notna() posts = posts[condition_mask] # Sample a subset of posts by toxicity if configured if toxicity_targets is not None: posts = sample_feed_by_toxicity(posts, toxicity_targets[i], feed_size) regular = posts[posts['fixed_position'].isna()] if 'fixed_position' in posts.columns else posts player.feed_toxicity = float(round(regular['toxicity'].mean(), 4)) # Shuffle post order and assign sequential ranks posts = posts.sample(frac=1).reset_index(drop=True) posts['sequence'] = np.arange(1, len(posts) + 1) # Randomly attach creative images to a subset of posts if creatives and num_creatives > 0: img_indices = np.random.choice(len(posts), size=min(num_creatives, len(posts)), replace=False) chosen_urls = np.random.choice(creatives, size=len(img_indices), replace=len(img_indices) > len(creatives)) posts.loc[img_indices, 'media'] = chosen_urls posts.loc[img_indices, 'pic_available'] = True posts.loc[img_indices, 'media_is_local'] = creatives_are_local player.creative_image = json.dumps({ str(row['doc_id']): row['media'] for _, row in posts.loc[img_indices].iterrows() }) # Assign processed posts to player-specific variable # (participant field kept as 'tweets' for backward-compatibility with existing databases) player.participant.tweets = posts # Record the sequence for each player player.sequence = ', '.join(map(str, posts['doc_id'].tolist())) def read_feed(path, delim): if re.match(r'^https?://\S+', path): if 'github' in path: posts = pd.read_csv(path, sep=delim) elif 'drive.google.com' in path: if '/uc?' in path: # Already in the correct format posts = pd.read_csv(path, sep=delim) else: # Convert from /file/d/ format file_id = path.split('/')[-2] download_url = f'https://drive.google.com/uc?id={file_id}' posts = pd.read_csv(download_url, sep=delim) else: raise ValueError("Unrecognized URL format") else: posts = pd.read_csv(path, sep=delim) return posts def format_dates(df): """Parse and format date columns.""" df['datetime'] = pd.to_datetime(df['datetime'], errors='coerce', format='mixed') mask = df['datetime'].isna() if mask.any(): df.loc[mask, 'datetime'] = pd.to_datetime( df.loc[mask, 'datetime'], errors='coerce', format='%d.%m.%y %H:%M' ) df['date'] = df['datetime'].dt.strftime('%d %b').str.replace(' ', '. ') df['date'] = df['date'].str.lstrip('0') df['formatted_datetime'] = df['datetime'].dt.strftime('%I:%M %p ยท %b %d, %Y') return df def highlight_entities(df): """Highlight hashtags, cashtags, mentions, and URLs in post text.""" df['text'] = df['text'].str.replace(r'\B(\#[a-zA-Z0-9_]+\b)', r'\g<0>', regex=True) df['text'] = df['text'].str.replace(r'\B(\$[a-zA-Z0-9_\.]+\b)', r'\g<0>', regex=True) df['text'] = df['text'].str.replace(r'\B(\@[a-zA-Z0-9_]+\b)', r'\g<0>', regex=True) # remove the href below, if you don't want them to leave your page df['text'] = df['text'].str.replace( r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])', r'\g<0>', regex=True) return df def prepare_numeric_fields(df): """Convert replies/reposts/likes to int, filling NAs with 0.""" df['replies'] = df['replies'].fillna(0).astype(int) df['reposts'] = df['reposts'].fillna(0).astype(int) df['likes'] = df['likes'].fillna(0).astype(int) return df def prepare_media(df): """Clean media URLs and set pic_available flag.""" df['media'] = df['media'].astype(str).str.replace("'|,", '', regex=True) # Any non-empty, non-nan value means media is available (URL or local path) df['pic_available'] = df['media'].apply( lambda m: bool(m and m.strip() and m not in ('nan', 'None', '')) ) # Local if pic is available but not a URL df['media_is_local'] = df['pic_available'] & ~df['media'].str.startswith('http') return df def prepare_user_profiles(df): """Prepare profile pics, icons, colors, descriptions, followers, and tooltip HTML.""" df['profile_pic_available'] = False df['icon'] = df['username'].str[:2].str.title() # Assign a deterministic color class based on username hash color_classes = ['p1', 'p2', 'p3', 'p4', 'p5', 'p6', 'p7', 'p8'] df['color_class'] = df['username'].apply( lambda name: color_classes[hash(name) % len(color_classes)] ) # make sure user descriptions do not entail any '' or "" as this complicates visualization # also replace nan with some whitespace df['user_description'] = df['user_description'].str.replace("'", '') df['user_description'] = df['user_description'].str.replace('"', '') df['user_description'] = df['user_description'].fillna(' ') # make number of followers a formatted string df['user_followers'] = df['user_followers'].map('{:,.0f}'.format).str.replace(',', '.') # Build tooltip HTML once per row df['tooltip_html'] = ( "