import re

import pandas as pd
import pdfplumber
# Convert the left-hand column of 'design_classes.pdf' into an Excel sheet with
# one row per major group (code, school + group name, enrollment, majors,
# tuition range).

# A line is treated as the start of a new major group when it begins with one
# of these letters.
GROUP_CODE_PREFIXES = ('A', 'B', 'C', 'D', 'E', 'F')

OUTPUT_COLUMNS = ['专业组代码', '学校名+组别', '招生人数', '专业信息', '学费区间']

# First integer or decimal number inside a tuition entry.
_AMOUNT_RE = re.compile(r'\d+(?:\.\d+)?')


def is_group_header(line: str) -> bool:
    """Return True when *line* starts a new major group."""
    return line.strip().startswith(GROUP_CODE_PREFIXES)


def format_major(line: str) -> str:
    """Render a ``a;b;c`` detail line as ``a b (c)``.

    Lines with fewer than three ``;``-separated fields are returned unchanged
    instead of raising ``IndexError``.
    """
    fields = line.split(';')
    if len(fields) < 3:
        return line
    return f"{fields[0]} {fields[1]} ({fields[2]})"


def extract_tuition(line: str) -> str:
    """Return the last ``;``-separated field of *line*, or '' if it has none."""
    if ';' not in line:
        return ''
    return line.split(';')[-1].strip()


def summarize_tuition(tuition_entries: list) -> str:
    """Return ``"<lowest>-<highest>"`` for the given tuition entries.

    Only the part before the first ``/`` of each entry is used. Entries are
    ordered by their numeric value (string comparison would rank "10000"
    below "8000"); entries without a number are ignored. Returns '' when no
    entry contains a number.
    """
    priced = []
    for entry in tuition_entries:
        amount_text = entry.split('/')[0].strip()
        # Drop thousands separators so "8,000" is read as 8000, not 8.
        found = _AMOUNT_RE.search(amount_text.replace(',', ''))
        if found:
            priced.append((float(found.group()), amount_text))
    if not priced:
        return ''
    lowest = min(priced, key=lambda pair: pair[0])[1]
    highest = max(priced, key=lambda pair: pair[0])[1]
    return f"{lowest}-{highest}"


def parse_groups(lines: list) -> list:
    """Parse text lines into ``[code, school_group, enrollment, majors, tuition]`` rows.

    Expected layout per group: a header line (see ``GROUP_CODE_PREFIXES``),
    then the school/group name, then a line whose first token is the
    enrollment count, then any number of detail lines up to the next header.
    Missing trailing lines yield empty strings rather than errors.
    """
    rows = []
    total = len(lines)
    i = 0
    while i < total:
        if not is_group_header(lines[i]):
            i += 1
            continue

        code = lines[i].strip()
        school_group = lines[i + 1].strip() if i + 1 < total else ''
        # A blank enrollment line has no tokens; fall back to ''.
        enrollment_tokens = lines[i + 2].split() if i + 2 < total else []
        enrollment = enrollment_tokens[0] if enrollment_tokens else ''

        majors = []
        tuition = []
        j = i + 3
        while j < total and not is_group_header(lines[j]):
            detail = lines[j].strip()
            if detail:  # skip blank lines inside a group
                majors.append(format_major(detail))
                tuition.append(extract_tuition(detail))
            j += 1

        rows.append([
            code,
            school_group,
            enrollment,
            '; '.join(majors),
            summarize_tuition(tuition),
        ])
        # Resume after everything this group consumed, so its name/enrollment
        # lines are never re-read as a new header.
        i = j
    return rows


def main(pdf_path: str = 'design_classes.pdf',
         xlsx_path: str = 'design_classes.xlsx') -> None:
    """Read *pdf_path*, parse the left half of every page, write *xlsx_path*."""
    data = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # Crop first, then extract: extract_text() has no region argument.
            left_half = page.crop((0, 0, page.width / 2, page.height))
            # NOTE(review): the previous version also extracted the right half
            # of the page but never used it. If the right column holds groups
            # too, crop (page.width / 2, 0, page.width, page.height) and feed
            # it through parse_groups() as well.
            left_text = left_half.extract_text() or ''
            data.extend(parse_groups(left_text.split('\n')))

    df = pd.DataFrame(data, columns=OUTPUT_COLUMNS)
    df.to_excel(xlsx_path, index=False)


if __name__ == '__main__':
    main()