公司有个很简单的需求:
我用 python 写了一个脚本,GUI 使用的是 pyqt5
# -*- coding: utf-8 -*-
import sys
import os
import csv
import pandas as pd
from datetime import datetime
from PyQt5.QtWidgets import (
QApplication, QMainWindow, QWidget, QPushButton, QLineEdit,
QVBoxLayout, QHBoxLayout, QFileDialog, QDateTimeEdit, QLabel,
QMessageBox, QProgressBar, QStatusBar,
)
from PyQt5.QtCore import QDateTime
from PyQt5.QtGui import QIntValidator
class CSV_Filter(QMainWindow):
def __init__(self):
super().__init__()
self.central_widget = QWidget()
self.setCentralWidget(self.central_widget)
self.init_ui()
def init_ui(self):
# 创建组件
self.input_select_button = QPushButton('浏览...')
self.input_path_text = QLineEdit()
self.input_path_text.setReadOnly(True)
self.datetime_start = QDateTimeEdit()
self.datetime_end = QDateTimeEdit()
self.time_diff_input = QLineEdit()
self.start_button = QPushButton('开始合并')
self.export_select_button = QPushButton('浏览...')
self.export_path_text = QLineEdit()
self.export_path_text.setReadOnly(True)
# 创建进度条和状态栏
self.progress_bar = QProgressBar()
self.status_bar = QStatusBar()
self.setStatusBar(self.status_bar)
self.status_bar.addPermanentWidget(self.progress_bar)
self.progress_bar.setValue(0)
# 设置日期时间选择框
debug_time = QDateTime(2024, 5, 3, 19, 10)
self.datetime_start.setDateTime(debug_time)
self.datetime_end.setDateTime(debug_time)
#now = QDateTime.currentDateTime()
#self.datetime_start.setDateTime(now)
#self.datetime_end.setDateTime(now)
self.datetime_start.setCalendarPopup(True)
self.datetime_end.setCalendarPopup(True)
self.datetime_start.setDisplayFormat("yyyy-MM-dd HH:mm")
self.datetime_end.setDisplayFormat("yyyy-MM-dd HH:mm")
self.time_diff_input.setPlaceholderText("输入分钟数")
self.time_diff_input.setValidator(QIntValidator())
self.start_button.setEnabled(False)
layout = QVBoxLayout()
layout.addWidget(QLabel("选择 log 路径:"))
input_path_layout = QHBoxLayout()
input_path_layout.addWidget(self.input_path_text)
input_path_layout.addWidget(self.input_select_button)
layout.addLayout(input_path_layout)
layout.addWidget(QLabel("选择导出路径:"))
export_path_layout = QHBoxLayout()
export_path_layout.addWidget(self.export_path_text)
export_path_layout.addWidget(self.export_select_button)
layout.addLayout(export_path_layout)
layout.addWidget(QLabel("开始时间:"))
layout.addWidget(self.datetime_start)
layout.addWidget(QLabel("时间差(分钟):"))
layout.addWidget(self.time_diff_input)
layout.addWidget(QLabel("结束时间:"))
layout.addWidget(self.datetime_end)
layout.addWidget(self.start_button)
self.central_widget.setLayout(layout)
self.setWindowTitle('CSV-Filter')
self.input_select_button.clicked.connect(self.select_input_folder)
self.export_select_button.clicked.connect(self.select_export_folder)
self.start_button.clicked.connect(self.merge_csv)
self.input_path_text.textChanged.connect(self.check_inputs)
self.export_path_text.textChanged.connect(self.check_inputs)
self.datetime_start.dateTimeChanged.connect(self.update_time_diff)
self.datetime_end.dateTimeChanged.connect(self.update_time_diff)
self.time_diff_input.textChanged.connect(self.update_end_time_from_diff)
def select_input_folder(self):
folder_path = QFileDialog.getExistingDirectory(self, '选择 log 所在的文件夹')
if folder_path:
self.input_path_text.setText(folder_path)
def select_export_folder(self):
folder_path = QFileDialog.getExistingDirectory(self, '选择导出 log 的文件夹')
if folder_path:
self.export_path_text.setText(folder_path)
def check_inputs(self):
flag_input = self.input_path_text.text().strip() != ""
flag_export = self.export_path_text.text().strip() != ""
self.start_button.setEnabled(flag_input and flag_export)
def update_time_diff(self):
start_time = self.datetime_start.dateTime()
end_time = self.datetime_end.dateTime()
time_diff = start_time.secsTo(end_time) / 60
self.time_diff_input.setText(str(int(time_diff)))
def update_end_time_from_diff(self):
try:
time_diff_minutes = int(self.time_diff_input.text())
start_time = self.datetime_start.dateTime()
new_end_time = start_time.addSecs(time_diff_minutes * 60)
self.datetime_end.setDateTime(new_end_time)
except ValueError:
pass
def merge_csv(self):
input_path = self.input_path_text.text().strip()
export_path = self.export_path_text.text().strip()
start_time = self.datetime_start.dateTime().toPyDateTime()
end_time = self.datetime_end.dateTime().toPyDateTime()
csv_files = []
for root, dirs, files in os.walk(input_path):
for file in files:
if file.endswith('.csv'):
csv_files.append(os.path.join(root,file))
if not csv_files:
QMessageBox.warning(self,"提示","没有找到.csv 文件")
return
combined_df = pd.DataFrame()
total_files = len(csv_files)
self.progress_bar.setMaximum(total_files)
self.progress_bar.setValue(0)
self.status_bar.showMessage("正在处理 CSV 文件...")
for index, csv_file in enumerate(csv_files, start=1):
try:
df = pd.read_csv(csv_file)
df['Source File'] = csv_file
df['DATE_TIME'] = pd.to_datetime(
df['DATE_TIME'].str.extract(r'\[(\d{4}/\d{2}/\d{2} \d{2}:\d{2}:\d{2})\]')[0],
format='%Y/%m/%d %H:%M:%S',
errors='coerce'
)
combined_df = pd.concat([combined_df, df], ignore_index=True)
except Exception as e:
QMessageBox.warning(self, "读取错误", f"读取文件失败:{csv_file}\n\n 错误信息:{str(e)}")
self.progress_bar.setValue(index) # 更新进度条
QApplication.processEvents() # 刷新界面
filtered_df = combined_df[(combined_df['DATE_TIME']>=start_time)&(combined_df['DATE_TIME']<=end_time)]
filtered_df = filtered_df.sort_values(by='DATE_TIME')
now = datetime.now()
timestamp = now.strftime("%Y%m%d_%H%M")
filename = f"filtered_log_{timestamp}.csv"
filtered_df.to_csv(os.path.join(export_path,filename),index=False)
self.status_bar.showMessage("完成!", 3000)
QMessageBox.information(self, "完成", "已成功导出")
if __name__ == '__main__':
app = QApplication(sys.argv)
window = CSV_Filter()
window.show()
sys.exit(app.exec_())
但测试的时候发现 csv 数据很不规范
随便抽一条当个例子:
"37929","301","00 40 00 00 00 B9 30 30 3A 30 30 3A 30 32 3A 31
31 33 20 28 32 34 34 30 29 56 20 65 76 65 6E 74
20 36 35 30 20 70 75 62 6C 69 63 3A 38 2C 31 20
30 20 22 64 69 73 6B 3A 38 2C 30 22 20 22 22 0A ","[2025/02/20 12:00:51]","9250","DATA LOG","00:00:02:113 (2440)V event 650 public:8,1 0 "disk:8,0" ""
"
数据应该是 7 列,但是读取到这里就会识别成 8 列然后报错. 我考虑过逐行读取不进行分列,只在其中用正则表达式抽选时间戳新增一列作为筛选的标准. 但因为原始数据中存在换行,这一条数据会被作为好几行读取,导致抽取时的损失
Python 新手,在 Chatgpt 的帮助下完成的,实在没办法了,有没有数据大手子帮忙看看
这是一个专为移动设备优化的页面(即为了让你能够在 Google 搜索结果里秒开这个页面),如果你希望参与 V2EX 社区的讨论,你可以继续到 V2EX 上打开本讨论主题的完整版本。
V2EX 是创意工作者们的社区,是一个分享自己正在做的有趣事物、交流想法,可以遇见新朋友甚至新机会的地方。
V2EX is a community of developers, designers and creative people.