1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
|
import json
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
# --- 1. Data loading & preprocessing ---
def load_and_preprocess_data(filepath, is_train=True):
    """Load conversation records from a JSON file and flatten them to text.

    Each record's ``conversation`` field (a list of turns, or a single value)
    is joined into one whitespace-separated string. ``None`` turns are dropped.

    Args:
        filepath: Path to the JSON file (a list of record dicts).
        is_train: When True, also collect each record's ``label`` field;
            the test set carries no labels.

    Returns:
        ``(texts, labels)`` where ``labels`` is ``None`` when
        ``is_train`` is False, or ``(None, None)`` if the file is
        missing or not valid JSON (an error message is printed).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as fh:
            records = json.load(fh)
    except FileNotFoundError:
        print(f"错误: 文件 {filepath} 未找到。")
        return None, None
    except json.JSONDecodeError:
        print(f"错误: 无法解析 {filepath} 中的JSON数据。")
        return None, None

    texts = []
    targets = []
    for record in records:
        raw = record.get('conversation', [])
        # A list of turns is joined; any other value is stringified as-is.
        if isinstance(raw, list):
            text = ' '.join(str(t) for t in raw if t is not None)
        else:
            text = str(raw)
        texts.append(text)
        if is_train:
            targets.append(record.get('label'))

    return texts, (targets if is_train else None)
# --- 2. Load the datasets ---
# Single source of truth for the output path: the original wrote to
# 'predictions3.txt' but its messages claimed 'predictions.txt'.
OUTPUT_PATH = 'predictions3.txt'

X_train, y_train = load_and_preprocess_data('train.json', is_train=True)
X_test, _ = load_and_preprocess_data('test.json', is_train=False)

# Abort if either dataset failed to load (the loader already printed why).
if X_train is None or X_test is None:
    print("由于数据加载错误,脚本已终止。")
else:
    # --- 3. Build and tune the model pipeline ---
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', LogisticRegression(max_iter=2000, solver='lbfgs'))
    ])

    # Hyper-parameter grid; keys use sklearn's '<step>__<param>' convention.
    param_grid = {
        'tfidf__max_features': [10000, 20000],
        'tfidf__ngram_range': [(1, 2)],
        # stop_words, min_df and max_df control vocabulary pruning.
        'tfidf__stop_words': [None, 'english'],
        'tfidf__min_df': [1, 3],
        'tfidf__max_df': [0.9, 0.95],
        'classifier__C': [0.5, 1.0, 2.0]
    }

    print("开始网格搜索以寻找最佳超参数...")
    # Count combinations generically so adding a grid key can't desync the
    # printed total (the original hand-multiplied each key's length).
    n_combos = int(np.prod([len(v) for v in param_grid.values()]))
    print(f"参数组合总数: {n_combos}")

    grid_search = GridSearchCV(
        pipeline,
        param_grid,
        cv=5,
        n_jobs=-1,
        verbose=1,
        scoring='f1_weighted'
    )
    grid_search.fit(X_train, y_train)

    print("\n找到的最佳参数: ", grid_search.best_params_)
    print("最佳交叉验证分数: {:.4f}".format(grid_search.best_score_))

    # --- 4. Predict on the test set and save the results ---
    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)
    # Concatenate labels with no separator (expected submission format).
    result = ''.join(map(str, predictions))
    try:
        with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
            f.write(result)
        print(f"\n预测结果已成功保存到 {OUTPUT_PATH} 文件。")
    except IOError:
        print(f"错误: 无法写入 {OUTPUT_PATH} 文件。")
|