# sci-gui-agent-benchmark/annotation/experiments/stopwatch/stopwatch.py
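"""Quantify newly introduced timing errors in stopwatch GUI runs.

Reads a ground-truth lap log (groundtruth.csv) and one exported CSV per run
(runs/*.csv), each with an 'Elapsed time' column, then reports per-run and
aggregate error statistics and plots a histogram of the newly introduced errors.
"""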

import glob
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as stats
import seaborn as sns
# Stopwatch data source: https://www.estopwatch.net/


def read_file(file_path):
    """Load a stopwatch CSV and parse its 'Elapsed time' column."""
    df = pd.read_csv(file_path)
    df['Elapsed time'] = pd.to_datetime(df['Elapsed time'], errors='coerce')
    return df


def analyze_new_error(run_df, groundtruth_df):
    """Return the error (in seconds) newly introduced at each recorded point of a run."""
    cumulative_errors = run_df['Elapsed time'] - groundtruth_df['Elapsed time']
    cumulative_errors_in_seconds = cumulative_errors.dt.total_seconds()
    # Differencing the cumulative error isolates the error added at each point.
    new_errors_in_seconds = cumulative_errors_in_seconds.diff().fillna(cumulative_errors_in_seconds.iloc[0])
    # Keep only the points where a non-zero error was newly introduced.
    new_error_points = new_errors_in_seconds[new_errors_in_seconds != 0].index.tolist()
    return new_errors_in_seconds[new_error_points]


def calculate_statistics(errors):
    """Compute summary statistics for a collection of newly introduced errors."""
    if len(errors) == 0:
        return {
            'mean_error': 0,
            'median_error': 0,
            'stddev_error': 0,
            'rmse_error': 0,
            'confidence_interval': (0, 0),
        }
    mean_error = np.mean(errors)
    median_error = np.median(errors)
    stddev_error = np.std(errors)
    rmse_error = np.sqrt(np.mean(np.square(errors)))
    if len(errors) > 1:
        # 95% confidence interval for the mean error, based on the t-distribution.
        ci_low, ci_high = stats.t.interval(
            confidence=0.95,
            df=len(errors) - 1,
            loc=mean_error,
            scale=stats.sem(errors),
        )
    else:
        # A single observation gives no spread to estimate an interval from.
        ci_low, ci_high = mean_error, mean_error
    return {
        'mean_error': mean_error,
        'median_error': median_error,
        'stddev_error': stddev_error,
        'rmse_error': rmse_error,
        'confidence_interval': (ci_low, ci_high),
    }


def main():
    groundtruth_file = 'groundtruth.csv'
    # Sort the run files so run numbering is deterministic across invocations.
    run_files = sorted(glob.glob('runs/*.csv'))
    groundtruth_df = read_file(groundtruth_file)
    run_dfs = {f'run{i+1}': read_file(file) for i, file in enumerate(run_files)}

    all_errors = []
    total_points = 0
    for run, df in run_dfs.items():
        errors = analyze_new_error(df, groundtruth_df)
        all_errors.extend(errors)
        total_points += len(df)

        results = calculate_statistics(errors)
        error_frequency = len(errors) / len(df)
        print(f"Results for {run}:")
        print(f"Mean New Error: {results['mean_error']:.5f} seconds")
        print(f"Median New Error: {results['median_error']:.5f} seconds")
        print(f"Standard Deviation of New Error: {results['stddev_error']:.5f} seconds")
        print(f"RMSE of New Error: {results['rmse_error']:.5f} seconds")
        print(f"95% Confidence Interval of New Error: ({results['confidence_interval'][0]:.5f}, {results['confidence_interval'][1]:.5f}) seconds")
        print(f"New Error Frequency: {error_frequency * 100:.5f} %")
        print('-----------------------------------------')

    # Aggregate statistics across all runs.
    total_results = calculate_statistics(all_errors)
    total_error_frequency = len(all_errors) / total_points
    print("Total Statistics:")
    print(f"Mean New Error: {total_results['mean_error']:.5f} seconds")
    print(f"Median New Error: {total_results['median_error']:.5f} seconds")
    print(f"Standard Deviation of New Error: {total_results['stddev_error']:.5f} seconds")
    print(f"RMSE of New Error: {total_results['rmse_error']:.5f} seconds")
    print(f"95% Confidence Interval of New Error: ({total_results['confidence_interval'][0]:.5f}, {total_results['confidence_interval'][1]:.5f}) seconds")
    print(f"New Error Frequency: {total_error_frequency * 100:.5f} %")
    # Report the mean error ± half-width of the 95% confidence interval.
    print(f"New Error: {total_results['mean_error']:.5f} ± {total_results['confidence_interval'][1] - total_results['mean_error']:.5f} seconds")

    # Histogram of every newly introduced error across all runs.
    plt.figure(figsize=(10, 5))
    sns.histplot(all_errors, bins=12, kde=False)
    plt.title('Distribution of Newly Introduced Errors (macOS)')
    plt.xlabel('Error Duration (seconds)')
    plt.ylabel('Frequency')
    plt.savefig('error_dist.png', dpi=300)
    plt.show()


if __name__ == "__main__":
    main()