Skip to content

Commit 5877729

Browse files
committed
add token score results
1 parent a5dc054 commit 5877729

11 files changed

Lines changed: 12152 additions & 31 deletions

linear-probes/all_layers_cache_train_eval.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
import probes
2828
from pprint import pprint as pp
2929

30-
from datasets import AmongUsDataset, TruthfulQADataset, DishonestQADataset, RepEngDataset, RolePlayingDataset, ApolloProbeDataset
30+
from probe_datasets import AmongUsDataset, TruthfulQADataset, DishonestQADataset, RepEngDataset, RolePlayingDataset, ApolloProbeDataset
3131
from configs import config_phi4, config_gpt2, config_llama3
3232
base_config = config_phi4
3333
amongus_expt_name: str = "2025-02-01_phi_phi_100_games_v3"

linear-probes/cache_activations.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
for module in [datasets, plots, configs, evaluate_utils]:
1212
importlib.reload(module)
1313

14-
from datasets import AmongUsDataset, TruthfulQADataset, DishonestQADataset, RepEngDataset, RolePlayingDataset, ApolloProbeDataset
14+
from probe_datasets import AmongUsDataset, TruthfulQADataset, DishonestQADataset, RepEngDataset, RolePlayingDataset, ApolloProbeDataset
1515
from configs import config_phi4, config_gpt2, config_llama3
1616

1717
def main(dataset_name: str):

linear-probes/evaluate_probes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
from typing import Dict, Any, List, Tuple
1616
from sklearn.metrics import roc_curve, auc, precision_recall_curve, average_precision_score
1717

18-
from datasets import TruthfulQADataset, DishonestQADataset, AmongUsDataset, RolePlayingDataset, RepEngDataset
18+
from probe_datasets import TruthfulQADataset, DishonestQADataset, AmongUsDataset, RolePlayingDataset, RepEngDataset
1919
from evaluate_utils import evaluate_probe_on_activation_dataset
2020
from configs import config_phi4, config_gpt2, config_llama3
2121
from plots import plot_behavior_distribution, plot_roc_curves, add_roc_curves, print_metrics, plot_roc_curve_eval

linear-probes/evaluate_utils.py

Lines changed: 14 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def evaluate_probe_on_dataset(test_df, model, tokenizer, probe, dataset, device,
7575
print(f"Accuracy: {accuracy}")
7676
return av_probe_outputs, accuracy
7777

78-
def evaluate_probe_on_activation_dataset(chunk_data, probe, device, num_tokens=None, verbose=True):
78+
def evaluate_probe_on_activation_dataset(chunk_data, probe, device, num_tokens=None, verbose=True, output_token_scores=False):
7979
"""
8080
Evaluate probe on a test dataset of activations and return probe outputs and accuracy
8181
@@ -88,34 +88,41 @@ def evaluate_probe_on_activation_dataset(chunk_data, probe, device, num_tokens=N
8888
verbose: Whether to print progress
8989
9090
Returns:
91-
tuple: (average probe outputs, accuracy)
91+
tuple: (average probe outputs, accuracy, probe outputs) if output_token_scores is True, otherwise (average probe outputs, accuracy)
9292
"""
9393
av_probe_outputs = []
94+
all_probe_outputs = []
9495
total, correct = 0, 0
9596

96-
for i, (activations, label) in enumerate(chunk_data):
97+
from tqdm import tqdm
98+
99+
for i, (activations, label) in enumerate(tqdm(chunk_data)):
97100
total += 1
98101

99102
# Take specified number of tokens from end of sequence
100103
acts_to_use = activations[-num_tokens:] if num_tokens else activations
101104

102105
# Get probe output for each token position
103-
probe_outputs = [round(probe.evaluate_single_activation(t.tensor(act, device=device)), 4)
106+
token_probe_outputs = [round(probe.evaluate_single_activation(t.tensor(act, device=device)), 4)
104107
for act in acts_to_use]
105-
avg_probe_output = sum(probe_outputs) / len(probe_outputs)
108+
avg_probe_output = sum(token_probe_outputs) / len(token_probe_outputs)
106109

107110
if label == 1 and avg_probe_output > 0.5:
108111
correct += 1
109112
elif label == 0 and avg_probe_output <= 0.5:
110113
correct += 1
111114

112115
av_probe_outputs.append(avg_probe_output)
116+
all_probe_outputs.append(token_probe_outputs)
113117

114118
if verbose and i % ((len(chunk_data) + 10) // 10) == 0:
115119
print(f"Evaluating {i}/{len(chunk_data)}", end="\t")
116-
print(f"Probe outputs: {probe_outputs}")
120+
print(f"Probe outputs: {token_probe_outputs}")
117121

118122
accuracy = correct / total
119123
if verbose:
120124
print(f".", end="")
121-
return av_probe_outputs, accuracy
125+
if output_token_scores:
126+
return av_probe_outputs, accuracy, all_probe_outputs
127+
else:
128+
return av_probe_outputs, accuracy
Lines changed: 24 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -72,10 +72,11 @@ def __init__(self, test_split, name: str = "", model=None, tokenizer=None, devic
7272
self.activation_cache.remove_hooks()
7373

7474
def get_chunk_path(self, chunk_idx: int) -> str:
75-
return os.path.join(self.activations_dir, f"chunk_{chunk_idx}.pkl")
75+
return os.path.join(os.path.dirname(os.path.abspath(__file__)), self.activations_dir, f"chunk_{chunk_idx}.pkl")
7676

7777
def save_chunk(self, chunk_data: List[Tuple[List[t.Tensor], int]], chunk_idx: int):
78-
os.makedirs(self.activations_dir, exist_ok=True)
78+
full_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.activations_dir)
79+
os.makedirs(full_dir_path, exist_ok=True)
7980
chunk_path = self.get_chunk_path(chunk_idx)
8081
with open(chunk_path, 'wb') as f:
8182
pickle.dump(chunk_data, f)
@@ -155,9 +156,9 @@ def get_train_data_stats(self, chunk_idx: int = 0) -> dict:
155156
class TruthfulQADataset(ActivationDataset):
156157
def __init__(self, config: Dict[str, Any]=None, model=None, tokenizer=None, device=None, test_split=None, **kwargs):
157158
super().__init__(test_split, "TruthfulQA", model, tokenizer, device, config["activation_size"])
158-
self.data_path: str = './data/TruthfulQA/TruthfulQA.csv'
159+
self.data_path: str = os.path.join(os.path.dirname(os.path.abspath(__file__)), './data/TruthfulQA/TruthfulQA.csv')
159160
self.tqa_df = pd.read_csv(self.data_path)
160-
self.activations_dir: str = f'./data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
161+
self.activations_dir: str = f'data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
161162
self.num_total_chunks = 1 # TruthfulQA uses single chunk
162163
self.format = eval(config["short_name"] + "_format")
163164

@@ -203,9 +204,9 @@ def populate_dataset(self, force_redo: bool = False, num_tokens: int = 5, max_ro
203204
class DishonestQADataset(ActivationDataset):
204205
def __init__(self, config: Dict[str, Any]=None, model=None, tokenizer=None, device=None, test_split=None, **kwargs):
205206
super().__init__(test_split, "DishonestQA", model, tokenizer, device, config["activation_size"])
206-
self.data_path: str = './data/TruthfulQA/TruthfulQA.csv'
207+
self.data_path: str = os.path.join(os.path.dirname(os.path.abspath(__file__)), './data/TruthfulQA/TruthfulQA.csv')
207208
self.tqa_df = pd.read_csv(self.data_path)
208-
self.activations_dir: str = f'./data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
209+
self.activations_dir: str = f'data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
209210
self.num_total_chunks = 1 # DishonestQA uses single chunk
210211
self.format = eval(config["short_name"] + "_format")
211212

@@ -272,11 +273,12 @@ def __init__(
272273
):
273274
super().__init__(test_split, "AmongUs", model, tokenizer, device, config["activation_size"])
274275
self.name: str = "AmongUs"
275-
self.agent_logs_path: str = os.path.join(raw_path, expt_name + "/agent-logs-compact.json")
276-
sys.path.append("..")
276+
base_dir = os.path.dirname(os.path.abspath(__file__))
277+
self.agent_logs_path: str = os.path.join(base_dir, raw_path, expt_name + "/agent-logs-compact.json")
278+
sys.path.append(os.path.join(base_dir, ".."))
277279
from utils import load_agent_logs_df
278280
self.agent_logs_df = load_agent_logs_df(self.agent_logs_path)
279-
self.activations_dir: str = f'./data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
281+
self.activations_dir: str = f'data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
280282
# load number of chunks from existing directory
281283
self.num_total_chunks = 0
282284
self.format = eval(config["short_name"] + "_format")
@@ -335,11 +337,14 @@ def populate_dataset(
335337
print(f"Loaded {self.num_total_chunks} existing chunks")
336338
return
337339

338-
if force_redo and os.path.exists(self.activations_dir):
339-
import shutil
340-
shutil.rmtree(self.activations_dir)
340+
if force_redo:
341+
full_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.activations_dir)
342+
if os.path.exists(full_dir_path):
343+
import shutil
344+
shutil.rmtree(full_dir_path)
341345

342-
os.makedirs(self.activations_dir, exist_ok=True)
346+
full_dir_path = os.path.join(os.path.dirname(os.path.abspath(__file__)), self.activations_dir)
347+
os.makedirs(full_dir_path, exist_ok=True)
343348

344349
# Find last processed chunk
345350
chunk_idx = 0
@@ -383,9 +388,9 @@ def populate_dataset(
383388
class RolePlayingDataset(ActivationDataset):
384389
def __init__(self, config: Dict[str, Any]=None, model=None, tokenizer=None, device=None, test_split=None, **kwargs):
385390
super().__init__(test_split, "Roleplaying", model, tokenizer, device, config["activation_size"])
386-
self.data_path: str = './data/Roleplaying/phi4_rollouts.csv'
391+
self.data_path: str = os.path.join(os.path.dirname(os.path.abspath(__file__)), './data/Roleplaying/phi4_rollouts.csv')
387392
self.df = pd.read_csv(self.data_path)
388-
self.activations_dir: str = f'./data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
393+
self.activations_dir: str = f'data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
389394
self.num_total_chunks = 1 # Roleplaying uses single chunk
390395
self.format = eval(config["short_name"] + "_format")
391396

@@ -424,9 +429,9 @@ def populate_dataset(self, force_redo: bool = False, num_tokens: int = 5, max_ro
424429
class RepEngDataset(ActivationDataset):
425430
def __init__(self, config: Dict[str, Any]=None, model=None, tokenizer=None, device=None, test_split=None, **kwargs):
426431
super().__init__(test_split, "RepEng", model, tokenizer, device, config["activation_size"])
427-
self.data_path: str = './data/RepE/true_false_facts.csv'
432+
self.data_path: str = os.path.join(os.path.dirname(os.path.abspath(__file__)), './data/RepE/true_false_facts.csv')
428433
self.df = pd.read_csv(self.data_path)
429-
self.activations_dir: str = f'./data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
434+
self.activations_dir: str = f'data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
430435
self.num_total_chunks = 1 # RepEng uses single chunk
431436
self.format = eval(config["short_name"] + "_format")
432437

@@ -465,9 +470,9 @@ def populate_dataset(self, force_redo: bool = False, num_tokens: int = 5, max_ro
465470
class ApolloProbeDataset(ActivationDataset):
466471
def __init__(self, config: Dict[str, Any]=None, model=None, tokenizer=None, device=None, test_split=None, **kwargs):
467472
super().__init__(test_split, "ApolloProbe", model, tokenizer, device, config["activation_size"])
468-
self.data_path: str = './data/ApolloProbe/common_claim_true_false.csv'
473+
self.data_path: str = os.path.join(os.path.dirname(os.path.abspath(__file__)), './data/ApolloProbe/common_claim_true_false.csv')
469474
self.df = pd.read_csv(self.data_path)
470-
self.activations_dir: str = f'./data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
475+
self.activations_dir: str = f'data/{self.name}_{config["short_name"]}_acts_{config["layer"]}/'
471476
self.num_total_chunks = 1 # ApolloProbe uses single chunk
472477
self.format = eval(config["short_name"] + "_format")
473478

Binary file not shown.

linear-probes/train_probes.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
sys.path.append(os.path.dirname(os.path.abspath('.')))
1313
sys.path.append('.')
1414

15-
from datasets import (
15+
from probe_datasets import (
1616
TruthfulQADataset,
1717
DishonestQADataset,
1818
AmongUsDataset,

reports/2025_03_01_aurocs.ipynb

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4285,7 +4285,7 @@
42854285
},
42864286
{
42874287
"cell_type": "code",
4288-
"execution_count": 6,
4288+
"execution_count": 9,
42894289
"metadata": {},
42904290
"outputs": [],
42914291
"source": [

0 commit comments

Comments
 (0)