public static class PdfHelper
{
    /// <summary>
    /// Pattern of the answer marker that starts every question slice:
    /// an opening parenthesis, a space, one digit, a space, a closing
    /// parenthesis and a trailing space, e.g. "( 3 ) ".
    /// </summary>
    private static readonly Regex AnswerMarkerRegex = new Regex(@"\( \d \) ");

    /// <summary>
    /// Reads the text of every page of a PDF, splits it into questions,
    /// converts them with <c>GetQuestionWithOptionList</c> and writes the
    /// result to a file as JSON.
    /// </summary>
    /// <param name="pdffile">The PDF file to read.</param>
    /// <param name="txtfile">
    /// The file that receives the JSON text. It is overwritten (not appended to)
    /// and encoded with the "utf-8" encoding.
    /// </param>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="pdffile"/> or <paramref name="txtfile"/> is null.
    /// </exception>
    public static void Pdf2Json(FileInfo pdffile, FileInfo txtfile)
    {
        // Fully qualified so the guards do not depend on the file's using directives.
        if (pdffile == null)
        {
            throw new System.ArgumentNullException(nameof(pdffile));
        }

        if (txtfile == null)
        {
            throw new System.ArgumentNullException(nameof(txtfile));
        }

        // Step 1: read the PDF and concatenate the text of all pages.
        var pdfContentStringBuilder = new StringBuilder();
        using (PdfDocument doc = PdfDocument.Open(pdffile.FullName))
        {
            foreach (Page page in doc.GetPages())
            {
                pdfContentStringBuilder.Append(page.Text);
            }
        }

        var pdfContent = pdfContentStringBuilder.ToString();

        // Step 2: split the full text into one string per question.
        var questionList = GetQuestionList(pdfContent);

        // Step 3: turn the raw question strings into structured question/option objects.
        var questionWithOptionsList = GetQuestionWithOptionList(questionList);

        // Step 4: serialize and write the JSON file.
        var jsonText = JsonConvert.SerializeObject(questionWithOptionsList);

        // The output file is opened only after the JSON has been produced, so a
        // failure while reading or parsing the PDF never leaves a truncated target
        // file behind; the using statement releases the handle even if the write throws.
        using (var writer = new StreamWriter(txtfile.FullName, false, Encoding.GetEncoding("utf-8")))
        {
            writer.Write(jsonText);
        }
    }

    /// <summary>
    /// Splits the PDF text into questions. Each returned string starts at an
    /// answer marker (see <c>AnswerMarkerRegex</c>) and runs up to, but not
    /// including, the next marker; the last one runs to the end of the text.
    /// </summary>
    /// <param name="pdfContent">The concatenated text of the PDF.</param>
    /// <returns>
    /// The question slices in document order. Any text that precedes the first
    /// marker is not included; the list is empty when no marker is found.
    /// </returns>
    /// <exception cref="System.ArgumentNullException">
    /// <paramref name="pdfContent"/> is null.
    /// </exception>
    public static List<string> GetQuestionList(string pdfContent)
    {
        if (pdfContent == null)
        {
            throw new System.ArgumentNullException(nameof(pdfContent));
        }

        // Step 1: collect the start index of every answer marker.
        var answerIndexList = new List<int>();
        foreach (Match match in AnswerMarkerRegex.Matches(pdfContent))
        {
            answerIndexList.Add(match.Index);
        }

        // Step 2: cut the text at those indices; the slice after the last marker
        // extends to the end of the content.
        var questionList = new List<string>(answerIndexList.Count);
        for (int i = 0; i < answerIndexList.Count; i++)
        {
            int start = answerIndexList[i];
            int end = i + 1 < answerIndexList.Count ? answerIndexList[i + 1] : pdfContent.Length;
            questionList.Add(pdfContent.Substring(start, end - start));
        }

        return questionList;
    }
publicstatic List<QuestionnaireWithOption> GetQuestionWithOptionList(List<string> questionList) { var questionWithOptionsList = new List<QuestionnaireWithOption>(); foreach (var question in questionList) { var questionWithOptions = new QuestionnaireWithOption(); var answer = 0; Match match = Regex.Match(question, @"\( \d \)"); if (match.Success) { answer = int.Parse(match.Value.Trim('(').Trim(')').Trim()); var questionWithoutAnswer = question.Replace(match.Value, ""); questionWithOptions.Question = questionWithoutAnswer;
var optionIndexList = new List<int>(); var optionRegex = new Regex(@"\(\d\)"); foreach (Match optionMatch in optionRegex.Matches(questionWithoutAnswer)) { optionIndexList.Add(optionMatch.Index); }
for (int i = 0; i < optionIndexList.Count; i++) { var optionStr = questionWithoutAnswer.Substring(optionIndexList[i]); if (i != optionIndexList.Count() - 1) { optionStr = questionWithoutAnswer.Substring(optionIndexList[i], optionIndexList[i + 1] - optionIndexList[i]); }
// Example invocation: convert the exam PDF in the Downloads folder into a JSON file
// with the same base name.
// NOTE(review): "{User}" sits inside a plain verbatim string (no '$' prefix), so it is
// passed through literally rather than interpolated — presumably a placeholder to be
// replaced with a real profile path; confirm before running.
var pdfInput = new FileInfo(@"{User}\Downloads\金融市場常識-109.pdf");
var jsonOutput = new FileInfo(@"{User}\Downloads\金融市場常識-109.json");
PdfHelper.Pdf2Json(pdfInput, jsonOutput);