forked from dangoslen/code-review-metrics
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanalyze-pr-metrics.py
More file actions
executable file
·305 lines (244 loc) · 12.8 KB
/
analyze-pr-metrics.py
File metadata and controls
executable file
·305 lines (244 loc) · 12.8 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
#!/usr/bin/env python3
import pandas as pd
import argparse
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime, timedelta
import seaborn as sns
def load_and_prepare_data(csv_file):
    """Read the PR metrics CSV and attach week-boundary columns.

    Parses the three timestamp columns and derives, for each PR, the
    Monday 00:00:00 start and Sunday 23:59:59 end of the week in which
    the PR was created.
    """
    data = pd.read_csv(csv_file)

    # Timestamp columns; first_reviewed_at may be blank, so coerce bad
    # values to NaT instead of raising.
    for column in ('created_at', 'merged_at'):
        data[column] = pd.to_datetime(data[column])
    data['first_reviewed_at'] = pd.to_datetime(data['first_reviewed_at'], errors='coerce')

    # Monday of the creation week: step back by the weekday offset
    # (Monday == 0), then drop the time-of-day component.
    monday = data['created_at'] - pd.to_timedelta(data['created_at'].dt.weekday, unit='D')
    data['week_start'] = monday.dt.normalize()

    # Inclusive end of the same week: Sunday at 23:59:59.
    data['week_end'] = data['week_start'] + pd.Timedelta(days=6, hours=23, minutes=59, seconds=59)
    return data
def analyze_by_week_and_author(df):
    """Aggregate PR metrics per (week_start, author) pair.

    Returns a flat DataFrame with one row per author per week holding
    the PR count, average/total lines changed, and the mean cycle time,
    lead time, and comment count, all rounded to two decimals.
    """
    aggregations = {
        'number': 'count',                   # how many PRs that week
        'lines_changed': ['mean', 'sum'],    # size per PR and overall
        'cycle_time_minutes': 'mean',
        'lead_time_minutes': 'mean',
        'comments_added': 'mean',
    }
    grouped = df.groupby(['week_start', 'created_by']).agg(aggregations).round(2)

    # Collapse the (column, statistic) MultiIndex into readable names.
    grouped.columns = [
        'pr_count', 'avg_lines_changed', 'total_lines_changed',
        'avg_cycle_time_minutes', 'avg_lead_time_minutes', 'avg_comments'
    ]
    return grouped.reset_index()
def create_weekly_analysis_plots(df, weekly_stats, output_dir='.'):
    """Render a 3x2 dashboard of weekly per-author PR metrics.

    Draws one line per author in each panel (PR count, average/total
    lines changed, average cycle time, average lead time, average
    comments), saves the figure to ``<output_dir>/weekly_analysis.png``
    at 300 dpi, and shows it.

    Parameters:
        df: Prepared PR data; used only to enumerate the authors so the
            color mapping is stable across panels.
        weekly_stats: Output of ``analyze_by_week_and_author``.
        output_dir: Directory the PNG is written to.
    """
    plt.style.use('default')
    sns.set_palette("husl")

    fig, axes = plt.subplots(3, 2, figsize=(16, 12))
    fig.suptitle('Weekly PR Metrics Analysis by Author', fontsize=16, fontweight='bold')

    # One stable color per author, shared by every panel.
    authors = df['created_by'].unique()
    colors = plt.cm.Set3(np.linspace(0, 1, len(authors)))
    author_colors = dict(zip(authors, colors))

    # (weekly_stats column, panel title, y-axis label, line marker)
    panels = [
        ('pr_count', 'PR Count by Week and Author', 'Number of PRs', 'o'),
        ('avg_lines_changed', 'Average Lines Changed by Week and Author', 'Average Lines Changed', 's'),
        ('total_lines_changed', 'Total Lines Changed by Week and Author', 'Total Lines Changed', '^'),
        ('avg_cycle_time_minutes', 'Average Cycle Time by Week and Author', 'Average Cycle Time (minutes)', 'd'),
        ('avg_lead_time_minutes', 'Average Lead Time by Week and Author', 'Average Lead Time (minutes)', 'v'),
        ('avg_comments', 'Average Comments by Week and Author', 'Average Comments', '*'),
    ]
    for i, (column, title, ylabel, marker) in enumerate(panels):
        # Only the first panel carries the legend: the author->color
        # mapping is identical in every panel, so one legend suffices.
        _plot_metric_by_author(axes[i // 2, i % 2], weekly_stats, authors,
                               author_colors, column, title, ylabel, marker,
                               show_legend=(i == 0))

    plt.tight_layout()
    plt.savefig(f'{output_dir}/weekly_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()


def _plot_metric_by_author(ax, weekly_stats, authors, author_colors,
                           column, title, ylabel, marker, show_legend=False):
    """Draw one dashboard panel: a line per author over week_start."""
    for author in authors:
        author_data = weekly_stats[weekly_stats['created_by'] == author]
        if not author_data.empty:
            ax.plot(author_data['week_start'], author_data[column],
                    marker=marker, label=author, color=author_colors[author], linewidth=2)
    ax.set_title(title)
    ax.set_xlabel('Week Start')
    ax.set_ylabel(ylabel)
    if show_legend:
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.grid(True, alpha=0.3)
    ax.tick_params(axis='x', rotation=45)
def create_author_summary_table(df, weekly_stats, output_dir='.'):
    """Build, persist, and print per-author summary statistics.

    Combines lifetime statistics from the raw PR rows with per-week
    averages from ``weekly_stats``, writes the combined table to
    ``<output_dir>/author_summary.csv``, prints it, and returns it.
    """
    # Lifetime statistics straight from the raw PR rows.
    overall = df.groupby('created_by').agg({
        'number': 'count',
        'lines_changed': ['mean', 'sum', 'std'],
        'cycle_time_minutes': ['mean', 'median'],
        'lead_time_minutes': ['mean', 'median'],
        'comments_added': 'mean',
    }).round(2)
    overall.columns = [
        'total_prs', 'avg_lines_changed', 'total_lines_changed', 'std_lines_changed',
        'avg_cycle_time', 'median_cycle_time', 'avg_lead_time', 'median_lead_time', 'avg_comments'
    ]

    # Per-week averages, computed over the already-aggregated weekly rows.
    per_week = weekly_stats.groupby('created_by').agg({
        'pr_count': 'mean',
        'avg_lines_changed': 'mean',
        'avg_cycle_time_minutes': 'mean',
        'avg_lead_time_minutes': 'mean',
    }).round(2)
    per_week.columns = ['avg_prs_per_week', 'avg_lines_per_week', 'avg_cycle_time_per_week', 'avg_lead_time_per_week']

    # Side-by-side join on the author index.
    final_summary = pd.concat([overall, per_week], axis=1)
    final_summary.to_csv(f'{output_dir}/author_summary.csv')

    print("\n=== AUTHOR SUMMARY STATISTICS ===")
    print(final_summary)
    return final_summary
def export_weekly_data(weekly_stats, output_dir='.'):
    """Write the weekly per-author statistics to a CSV in output_dir."""
    destination = f'{output_dir}/weekly_author_stats.csv'
    weekly_stats.to_csv(destination, index=False)
    print(f"\nWeekly statistics exported to: {destination}")
def export_detailed_weekly_data(df, output_dir='.'):
    """Export granular weekly per-author statistics plus derived scores.

    For each (week_start, created_by) group this computes count/mean/
    median/min/max/std statistics for PR numbers, lines changed, cycle
    time, lead time, and comments, adds three derived metrics, writes
    the table to ``detailed_weekly_author_stats.csv``, and returns it.
    """
    stat_spec = {
        'number': ['count', 'min', 'max'],
        'lines_changed': ['count', 'mean', 'sum', 'min', 'max', 'std'],
        'cycle_time_minutes': ['count', 'mean', 'median', 'min', 'max', 'std'],
        'lead_time_minutes': ['count', 'mean', 'median', 'min', 'max', 'std'],
        'comments_added': ['count', 'mean', 'sum', 'min', 'max', 'std'],
    }
    detailed = df.groupby(['week_start', 'created_by']).agg(stat_spec).round(2)

    # Replace the two-level aggregate column index with flat, prefixed names.
    detailed.columns = [
        'pr_count', 'pr_number_min', 'pr_number_max',
        'lines_count', 'lines_mean', 'lines_sum', 'lines_min', 'lines_max', 'lines_std',
        'cycle_count', 'cycle_mean', 'cycle_median', 'cycle_min', 'cycle_max', 'cycle_std',
        'lead_count', 'lead_mean', 'lead_median', 'lead_min', 'lead_max', 'lead_std',
        'comments_count', 'comments_mean', 'comments_sum', 'comments_min', 'comments_max', 'comments_std'
    ]
    detailed = detailed.reset_index()

    # Derived metrics; the +1 keeps the efficiency ratio finite when the
    # mean cycle time is zero.
    detailed['avg_pr_size'] = detailed['lines_sum'] / detailed['pr_count']
    detailed['productivity_score'] = detailed['pr_count'] * detailed['avg_pr_size']
    detailed['efficiency_score'] = detailed['pr_count'] / (detailed['cycle_mean'] + 1)

    derived = ['avg_pr_size', 'productivity_score', 'efficiency_score']
    detailed[derived] = detailed[derived].round(2)

    detailed.to_csv(f'{output_dir}/detailed_weekly_author_stats.csv', index=False)
    print(f"Detailed weekly statistics exported to: {output_dir}/detailed_weekly_author_stats.csv")
    return detailed
def export_raw_pr_data_by_week(df, output_dir='.'):
    """Dump individual PR rows, ordered by week and author, to a CSV.

    Expects ``week_start``/``week_end`` to already exist on ``df``
    (added by ``load_and_prepare_data``). Returns the sorted copy with
    all original columns intact.
    """
    # Work on a copy so the caller's frame keeps its original row order.
    ordered = df.copy().sort_values(['week_start', 'created_by', 'created_at'])

    # Only the columns useful for per-PR drill-down analysis go to disk.
    columns = [
        'week_start', 'week_end', 'created_by', 'number', 'title', 'url',
        'created_at', 'merged_at', 'first_reviewed_at', 'first_reviewed_by',
        'cycle_time_minutes', 'lead_time_minutes', 'lines_changed', 'comments_added'
    ]
    ordered[columns].to_csv(f'{output_dir}/raw_prs_by_week.csv', index=False)
    print(f"Raw PR data by week exported to: {output_dir}/raw_prs_by_week.csv")
    return ordered
def main():
    """CLI entry point: load the CSV, run every analysis, write outputs."""
    cli = argparse.ArgumentParser(description='Analyze PR metrics by week and author')
    cli.add_argument('csv_file', help='Path to the CSV file with PR metrics')
    cli.add_argument('-o', '--output-dir', default='.', help='Output directory for generated files')
    cli.add_argument('--no-plots', action='store_true', help='Skip generating plots')
    opts = cli.parse_args()

    print(f"Loading data from {opts.csv_file}...")
    prs = load_and_prepare_data(opts.csv_file)
    print(f"Analyzing {len(prs)} PRs from {prs['created_at'].min().date()} to {prs['created_at'].max().date()}")
    print(f"Authors found: {', '.join(prs['created_by'].unique())}")

    # Aggregate once, then fan out to the various exports.
    weekly_stats = analyze_by_week_and_author(prs)
    export_weekly_data(weekly_stats, opts.output_dir)
    detailed_weekly = export_detailed_weekly_data(prs, opts.output_dir)
    raw_data = export_raw_pr_data_by_week(prs, opts.output_dir)
    author_summary = create_author_summary_table(prs, weekly_stats, opts.output_dir)

    # Plotting is optional, e.g. for headless environments.
    if not opts.no_plots:
        print("Generating plots...")
        create_weekly_analysis_plots(prs, weekly_stats, opts.output_dir)
        print(f"Plots saved to: {opts.output_dir}/weekly_analysis.png")

    print(f"\nAnalysis complete! Files saved to: {opts.output_dir}/")
    print("\nGenerated files:")
    print("- weekly_author_stats.csv: Basic weekly statistics")
    print("- detailed_weekly_author_stats.csv: Comprehensive weekly statistics with derived metrics")
    print("- raw_prs_by_week.csv: Raw PR data organized by week and author")
    print("- author_summary.csv: Overall author performance summary")
    if not opts.no_plots:
        print("- weekly_analysis.png: Visual dashboard of weekly trends")


if __name__ == "__main__":
    main()