Python: File writing and logging

Python Script for Processing CSV Files and Logging

This Python script processes CSV files containing first_name and last_name columns, extracts unique combinations of these columns, and saves the results to a new CSV file. It also includes logging that records progress and errors to both the console and a log file.

What Does This Script Do?

  1. Logging Setup:

    • Logs messages to both a log file (process.log) and the console.
    • Supports different log levels: info, warning, error, and debug.
  2. Data Processing:

    • Reads all CSV files from a specified input directory.
    • Extracts unique first_name and last_name combinations from each file.
    • Tracks the source file for each unique combination.
  3. Error Handling:

    • Skips files that don’t have the required columns (first_name and last_name).
    • Logs errors if any issues occur during file processing.
  4. Output:

    • Saves the combined unique data to a new CSV file in a specified output directory (a sample of the output layout follows this list).
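
For reference, the output file has three columns: first_name, last_name, and source_file. With hypothetical file names and rows, unique_first_name_last_name.csv might look like:

first_name,last_name,source_file
Jane,Doe,customers_jan.csv
John,Smith,customers_feb.csv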

Code


import pandas as pd
import glob
import os
import logging

# -------------------------
# Logging Setup
# -------------------------

# Define log directory and file
log_dir = r"C:\Users\YourUsername\Documents\output\logs"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "process.log")

# Configure logger
logger = logging.getLogger("DataProcessLogger")
logger.setLevel(logging.INFO)

# File handler (for log file output)
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)
file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)

# Stream handler (for console output; StreamHandler writes to stderr by default)
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_formatter = logging.Formatter('%(message)s')
stream_handler.setFormatter(stream_formatter)

# Add both handlers to the logger
logger.addHandler(file_handler)
logger.addHandler(stream_handler)
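
# Optional: keep records from propagating to the root logger, which would
# otherwise duplicate console output if the root logger is configured elsewhere
logger.propagate = False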

def log_message(message, level="info"):
    """
    Log a message at the given level.

    level options: "info", "warning", "error", "debug"; anything else
    falls back to "debug". Note that debug messages are filtered out
    unless the logger and handler levels above are lowered to DEBUG.
    Messages are sent to both the console and the log file.
    """
    level = level.lower()
    if level == "info":
        logger.info(message)
    elif level == "warning":
        logger.warning(message)
    elif level == "error":
        logger.error(message)
    else:
        logger.debug(message)

# -------------------------
# Data Output Helper Function
# -------------------------

def save_data(data, filename, output_dir):
    """
    Save a DataFrame to a CSV file in a specified output directory.
    Logs a message indicating where the data was saved.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, filename)
    data.to_csv(output_file, index=False)
    log_message(f"Data saved to {output_file}", level="info")

# -------------------------
# Processing CSV Files
# -------------------------

# Define the input directory for CSV files
input_dir = r"C:\Users\YourUsername\Documents\data"
csv_files = glob.glob(os.path.join(input_dir, "*.csv"))

# Define the output directory for real data (e.g., unique first_name/last_name combos)
data_output_dir = r"C:\Users\YourUsername\Documents\output\data"
os.makedirs(data_output_dir, exist_ok=True)

# List to hold each file's unique first_name/last_name pairs
unique_dfs = []

# Process each CSV file
for file in csv_files:
    try:
        # Force all columns to be read as text (preserves leading zeros and keeps comparisons consistent)
        df = pd.read_csv(file, dtype=str)
        file_name = os.path.basename(file)

        # Check that the required columns exist before processing
        if {'first_name', 'last_name'}.issubset(df.columns):
            # drop_duplicates() returns a new frame; .copy() makes ownership
            # explicit and guards against SettingWithCopyWarning
            unique_df = df[['first_name', 'last_name']].drop_duplicates().copy()
            # Record the source file name in a new column
            unique_df['source_file'] = file_name
            unique_dfs.append(unique_df)
            log_message(f"Processed file: {file_name}", level="info")
        else:
            log_message(f"Skipping file: {file_name} - Missing required columns", level="warning")
    except Exception as e:
        log_message(f"Error processing file: {file} - {e}", level="error")

# Combine the unique first_name/last_name pairs from all processed files
if unique_dfs:
    combined_unique_df = pd.concat(unique_dfs, ignore_index=True)
else:
    # No usable files were found; keep the expected columns so the
    # output CSV is still written with a header
    combined_unique_df = pd.DataFrame(columns=['first_name', 'last_name', 'source_file'])
    log_message("No CSV files with the required columns were found", level="warning")

# -------------------------
# Save the Unique Data to a CSV File (Real Data Output)
# -------------------------

save_data(combined_unique_df, "unique_first_name_last_name.csv", data_output_dir)
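
Once the script has run, a quick sanity check is to load the output back into pandas. This is a minimal sketch, assuming the same output path as above:

import os
import pandas as pd

output_file = os.path.join(r"C:\Users\YourUsername\Documents\output\data",
                           "unique_first_name_last_name.csv")
result = pd.read_csv(output_file, dtype=str)
print(result.shape)   # (row_count, 3): first_name, last_name, source_file
print(result.head())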