74 lines
2.8 KiB
Python
74 lines
2.8 KiB
Python
import seaborn as sns
|
|
import matplotlib.pyplot as plt
|
|
from pathlib import Path
|
|
import tempfile
|
|
import logging
|
|
import pandas as pd
|
|
|
|
def generate(processed_df: pd.DataFrame) -> "Path | None":
    """Render a log-scale histogram of the task time estimate midpoints.

    This generator corresponds to 'cell1' from the original analysis notebook.
    It visualizes the distribution of the calculated midpoint of time estimates
    for all tasks on a logarithmic scale to handle the wide range of values.

    Args:
        processed_df: The preprocessed data, expected to contain
            'lb_estimate_in_minutes' and 'ub_estimate_in_minutes' columns.
            The frame is only read; it is never modified.

    Returns:
        The path to the generated temporary image file, or None if the
        required columns are missing, no plottable (positive, finite)
        midpoints remain, or plotting/saving fails.
    """
    logging.info("Generating task estimate distribution plot...")

    # --- Data Validation and Preparation ---
    required_cols = ['lb_estimate_in_minutes', 'ub_estimate_in_minutes']
    if not all(col in processed_df.columns for col in required_cols):
        logging.error(
            f"Required columns {required_cols} not found in the DataFrame. "
            "Cannot generate plot."
        )
        return None

    midpoints = _plottable_estimate_midpoints(processed_df)
    if midpoints.empty:
        logging.warning("No data with positive estimate midpoints available to plot.")
        return None

    # --- Plotting ---
    # Keep an explicit handle on the figure so the cleanup below closes exactly
    # the figure created here, never some other generator's "current" figure.
    fig = None
    try:
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(x=midpoints, log_scale=True, ax=ax)

        ax.set_title('Distribution of Task Time Estimate Midpoints', fontsize=16)
        ax.set_xlabel('Estimate Midpoint (minutes, log scale)', fontsize=12)
        ax.set_ylabel('Number of Tasks', fontsize=12)
        fig.tight_layout()

        # --- File Saving ---
        # Save to a temporary file. The orchestrator (`generate.py`)
        # will move this to the final 'dist/' directory.
        temp_path = Path(tempfile.gettempdir()) / "task_estimate_distribution.png"
        fig.savefig(temp_path, dpi=300)
        logging.info(f"Successfully saved plot to temporary file: {temp_path}")

        return temp_path

    except Exception as e:
        # Top-level boundary for this generator: report the failure and let
        # the remaining generators keep running.
        logging.error(f"An error occurred while generating the plot: {e}", exc_info=True)
        return None
    finally:
        # Close the figure to free up memory, which is crucial when running
        # many generators.
        if fig is not None:
            plt.close(fig)


def _plottable_estimate_midpoints(processed_df: pd.DataFrame) -> pd.Series:
    """Return the estimate midpoints that can be drawn on a log axis.

    Only the two bound columns are read, so the caller's frame is neither
    copied in full nor modified.
    """
    # Coerce instead of raising: a non-numeric cell becomes NaN and is dropped
    # by the filter below rather than aborting the whole generator.
    lower = pd.to_numeric(processed_df['lb_estimate_in_minutes'], errors='coerce')
    upper = pd.to_numeric(processed_df['ub_estimate_in_minutes'], errors='coerce')

    # Midpoint of the lower and upper bounds, as was done in the notebook.
    midpoints = ((lower + upper) / 2).rename('estimate_midpoint')

    # A log axis needs strictly positive, finite values. Comparisons with NaN
    # are False, so missing values are removed by the same mask.
    plottable = (midpoints > 0) & (midpoints < float('inf'))
    return midpoints[plottable]
|