Python: File writing and logging

Python Script for Processing CSV Files and Logging
This Python script processes CSV files containing first_name and last_name columns, extracts the unique combinations of those columns, and saves the results to a new CSV file. It also includes robust logging to track the process and handle errors.
What Does This Script Do?
- Logging Setup:
  - Logs messages to both a log file (process.log) and the console.
  - Supports different log levels: info, warning, error, and debug.
- Data Processing:
  - Reads all CSV files from a specified input directory.
  - Extracts unique first_name and last_name combinations from each file (illustrated in the sketch after this list).
  - Tracks the source file for each unique combination.
- Error Handling:
  - Skips files that don’t have the required columns (first_name and last_name).
  - Logs errors if any issues occur during file processing.
- Output:
  - Saves the combined unique data to a new CSV file in a specified output directory.
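To make the extraction step concrete, here is a minimal, self-contained sketch of what the script does to each file's rows; the data and the customers.csv name are hypothetical:

```python
import pandas as pd

# Hypothetical rows from one input file: note the repeated name pair
df = pd.DataFrame({
    "first_name": ["Ada", "Ada", "Grace"],
    "last_name":  ["Lovelace", "Lovelace", "Hopper"],
    "email":      ["a@example.com", "a@work.example", "g@example.com"],
})

# The per-file step the script performs:
unique_df = df[["first_name", "last_name"]].drop_duplicates()
unique_df["source_file"] = "customers.csv"  # hypothetical source name

print(unique_df)
# Two rows remain: (Ada, Lovelace) and (Grace, Hopper), each tagged
# with the file they came from.
```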
Code
```python
import pandas as pd
import glob
import os
import logging
# -------------------------
# Logging Setup
# -------------------------
# Define log directory and file
log_dir = r"C:\Users\YourUsername\Documents\output\logs"
os.makedirs(log_dir, exist_ok=True)
log_file = os.path.join(log_dir, "process.log")
# Configure logger
logger = logging.getLogger("DataProcessLogger")
logger.setLevel(logging.INFO)
# File handler (for log file output)
file_handler = logging.FileHandler(log_file)
file_handler.setLevel(logging.INFO)
file_formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
file_handler.setFormatter(file_formatter)
# Stream handler (for console output; StreamHandler writes to stderr by default)
stream_handler = logging.StreamHandler()
stream_handler.setLevel(logging.INFO)
stream_formatter = logging.Formatter('%(message)s')
stream_handler.setFormatter(stream_formatter)
# Add both handlers to the logger
logger.addHandler(file_handler)
logger.addHandler(stream_handler)
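
# With the file formatter above, each line in process.log looks like
# (illustrative): 2024-05-01 10:15:30,123 - INFO - Processed file: data.csv
# Console output, via the stream formatter, shows only the message text.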
def log_message(message, level="info"):
    """
    Log a message.
    level options: "info", "warning", "error", "debug".
    Messages are sent to both the console and the log file.
    """
    level = level.lower()
    if level == "info":
        logger.info(message)
    elif level == "warning":
        logger.warning(message)
    elif level == "error":
        logger.error(message)
    else:
        logger.debug(message)
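
# Example usage (illustrative):
#   log_message("Starting run")                    # INFO -> console + log file
#   log_message("Odd row count", level="warning")  # WARNING -> both
# Note: level="debug" messages are dropped here because the logger's
# level is set to INFO above.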
# -------------------------
# Data Output Helper Function
# -------------------------
def save_data(data, filename, output_dir):
    """
    Save a DataFrame to a CSV file in a specified output directory.
    Logs a message indicating where the data was saved.
    """
    os.makedirs(output_dir, exist_ok=True)
    output_file = os.path.join(output_dir, filename)
    data.to_csv(output_file, index=False)
    log_message(f"Data saved to {output_file}", level="info")
# -------------------------
# Processing CSV Files
# -------------------------
# Define the input directory for CSV files
input_dir = r"C:\Users\YourUsername\Documents\data\\"
csv_files = glob.glob(input_dir + "*.csv")
# Define the output directory for real data (e.g., unique first_name/last_name combos)
data_output_dir = r"C:\Users\YourUsername\Documents\output\data"
os.makedirs(data_output_dir, exist_ok=True)
# List to hold each file's unique first_name/last_name pairs
unique_dfs = []
# Process each CSV file
for file in csv_files:
    try:
        # Force all columns to be read as text
        df = pd.read_csv(file, dtype=str)
        file_name = os.path.basename(file)
        # Check that the required columns exist before processing
        if {'first_name', 'last_name'}.issubset(df.columns):
            unique_df = df[['first_name', 'last_name']].drop_duplicates()
            # Append the source file name as a new column
            unique_df['source_file'] = file_name
            unique_dfs.append(unique_df)
            log_message(f"Processed file: {file_name}", level="info")
        else:
            log_message(f"Skipping file: {file_name} - Missing required columns", level="warning")
    except Exception as e:
        log_message(f"Error processing file: {file} - {e}", level="error")
# Combine the unique first_name/last_name pairs from all processed files
if unique_dfs:
    combined_unique_df = pd.concat(unique_dfs, ignore_index=True)
else:
    # No files yielded data; create an empty frame with the expected
    # columns so the output CSV still gets a proper header
    combined_unique_df = pd.DataFrame(columns=['first_name', 'last_name', 'source_file'])
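
# Note: drop_duplicates ran per file, so the same name pair can appear
# more than once in the combined frame (once per source file that
# contained it).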
# -------------------------
# Save the Unique Data to a CSV File (Real Data Output)
# -------------------------
save_data(combined_unique_df, "unique_first_name_last_name.csv", data_output_dir)
```
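
After a run, a quick sanity check is to reload the combined CSV and confirm the counts. This is a minimal sketch that assumes the output paths above were left unchanged:

```python
import pandas as pd

# Reload the combined output written by the script
# (path assumes the defaults used above)
out_path = r"C:\Users\YourUsername\Documents\output\data\unique_first_name_last_name.csv"
combined = pd.read_csv(out_path, dtype=str)

print(combined.head())
print(f"{len(combined)} name pairs drawn from {combined['source_file'].nunique()} source files")
```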