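# "编辑代码" ("Edit code") - stray UI label left over from a copy-paste; kept as a comment so it is not executed as a statement.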

import pdfplumber
import pandas as pd

# Line prefixes that mark the first line of a major-group record.
CODE_PREFIXES = ('A', 'B', 'C', 'D', 'E', 'F')

# Column headers of the exported sheet.
OUTPUT_COLUMNS = ['专业组代码', '学校名+组别', '招生人数', '专业信息', '学费区间']


def _format_major(line: str) -> str:
    """Render one ';'-separated major line as 'field0 field1 (field2)'.

    Lines with fewer than three fields are joined with spaces instead of
    raising IndexError.
    """
    fields = line.split(';')
    if len(fields) >= 3:
        return f"{fields[0]} {fields[1]} ({fields[2]})"
    return ' '.join(fields)


def _tuition_amount(line: str) -> str:
    """Return the tuition amount of a major line, or '' if it has none.

    The tuition is taken from the last ';'-separated field; anything after
    a '/' in that field is dropped.
    """
    if ';' not in line:
        return ''
    return line.split(';')[-1].strip().split('/')[0].strip()


def _tuition_sort_key(amount: str):
    """Order numeric amounts by value; non-numeric text sorts after them."""
    normalized = amount.replace(',', '')
    if normalized.replace('.', '', 1).isdecimal():
        return (0, float(normalized), '')
    return (1, 0.0, amount)


def _tuition_range(major_lines) -> str:
    """Return 'lowest-highest' over the tuition amounts, or '' if none."""
    amounts = [amount for amount in map(_tuition_amount, major_lines) if amount]
    if not amounts:
        return ''
    # Compare by numeric value: plain string comparison would rank
    # "12000" below "5000".
    lowest = min(amounts, key=_tuition_sort_key)
    highest = max(amounts, key=_tuition_sort_key)
    return f"{lowest}-{highest}"


def parse_column_text(text) -> list:
    """Parse the text of one page column into record rows.

    A record starts at a line beginning with one of CODE_PREFIXES. The next
    line is taken as the school/group name, the first token of the line after
    that as the enrollment count, and every following non-blank line up to
    the next code line as a major entry.

    Args:
        text: Extracted column text; None or '' yields no records.

    Returns:
        A list of [code, school_group, enrollment, majors, tuition_range]
        rows, in the order the records appear in the text.
    """
    if not text:
        return []

    lines = [raw.strip() for raw in text.split('\n')]
    total = len(lines)
    records = []

    for i, line in enumerate(lines):
        if not line.startswith(CODE_PREFIXES):
            continue

        school_group = lines[i + 1] if i + 1 < total else ''
        # A blank enrollment line yields '' rather than an IndexError.
        enrollment_tokens = lines[i + 2].split() if i + 2 < total else []
        enrollment = enrollment_tokens[0] if enrollment_tokens else ''

        # Collect major lines until the next record header.
        major_lines = []
        j = i + 3
        while j < total and not lines[j].startswith(CODE_PREFIXES):
            if lines[j]:
                major_lines.append(lines[j])
            j += 1

        majors_str = '; '.join(_format_major(m) for m in major_lines)
        records.append(
            [line, school_group, enrollment, majors_str, _tuition_range(major_lines)]
        )

    return records


def _extract_column_texts(page):
    """Return the text of the left and right halves of a pdfplumber page."""
    x0, top, x1, bottom = page.bbox
    middle = (x0 + x1) / 2
    # extract_text() has no region argument; crop the page to each half first.
    halves = ((x0, top, middle, bottom), (middle, top, x1, bottom))
    # Normalize a missing result (None) to '' so callers can parse it safely.
    return [page.crop(bbox).extract_text() or '' for bbox in halves]


def main():
    """Read design_classes.pdf and export its records to design_classes.xlsx."""
    data = []

    with pdfplumber.open('design_classes.pdf') as pdf:
        for page in pdf.pages:
            # Left column first, then right, to follow reading order.
            for column_text in _extract_column_texts(page):
                data.extend(parse_column_text(column_text))

    df = pd.DataFrame(data, columns=OUTPUT_COLUMNS)
    df.to_excel('design_classes.xlsx', index=False)


if __name__ == '__main__':
    main()