iText 与 PDFBox 都可以用作 PDF 解析工具，下面简单介绍如何分别使用 iText 与 PDFBox 进行文本坐标定位。
iText 坐标定位：根据关键字查找其所在的坐标和页码
import java.io.IOException; import com.itextpdf.awt.geom.Rectangle2D.Float; import com.itextpdf.text.pdf.PdfReader; import com.itextpdf.text.pdf.parser.ImageRenderInfo; import com.itextpdf.text.pdf.parser.PdfReaderContentParser; import com.itextpdf.text.pdf.parser.RenderListener; import com.itextpdf.text.pdf.parser.TextRenderInfo; public class Demo { // 定义关键字 private static String KEY_WORD = "KEYWORD"; // 定义返回值 private static float[] resu = null; // 定义返回页码 private static int i = 0; /* * 返回关键字所在的坐标和页数 float[0] >> X float[1] >> Y float[2] >> page */ private float[] getKeyWords(String filePath) { try { PdfReader pdfReader = new PdfReader(filePath); int pageNum = pdfReader.getNumberOfPages(); PdfReaderContentParser pdfReaderContentParser = new PdfReaderContentParser( pdfReader); // 下标从1开始 for (i = 1; i < pageNum; i++) { pdfReaderContentParser.processContent(i, new RenderListener() { @Override public void renderText(TextRenderInfo textRenderInfo) { String text = textRenderInfo.getText(); if (null != text && text.contains(KEY_WORD)) { Float boundingRectange = textRenderInfo .getBaseline().getBoundingRectange(); resu = new float[3]; resu[0] = boundingRectange.x; resu[1] = boundingRectange.y; resu[2] = i; } } @Override public void renderImage(ImageRenderInfo arg0) { // TODO Auto-generated method stub } @Override public void endTextBlock() { // TODO Auto-generated method stub } @Override public void beginTextBlock() { // TODO Auto-generated method stub } }); } } catch (IOException e) { // TODO Auto-generated catch block e.printStackTrace(); } return resu; } }
PDFBox 坐标定位：根据字符串组合（关键字）定位对应的 TextPosition
/**
 * Finds the coordinates of a table's start keyword, end keyword and title
 * keyword in a PDF file.
 *
 * <p>Pages are examined in order. A page qualifies only when all three
 * keywords are found on it; if several pages qualify, the last one wins.
 * For each keyword the coordinates are taken from the last character of its
 * first occurrence on the page.
 *
 * @param pdfInfoRegular supplies the three keywords via {@code getstr_TS()}
 *                       (table start), {@code getstr_End()} (table end) and
 *                       {@code getKeyword()} (title)
 * @param file           path of the PDF file
 * @return a list of four arrays: {@code [x, y, heightDir]} for the start
 *         keyword, the same for the end keyword, the same for the title, and
 *         a one-element array holding the 0-based page index; {@code null}
 *         when no page contains all three keywords or an error occurred
 *         before a match was found
 */
public List<float[]> getTableLoactions(PdfInfoRegular pdfInfoRegular, String file) {
    List<float[]> position = null;
    PDDocument document = null;
    try {
        document = PDDocument.load(file);
        List<?> allPages = document.getDocumentCatalog().getAllPages();
        for (int pageIndex = 0; pageIndex < allPages.size(); pageIndex++) {
            PDPage page = (PDPage) allPages.get(pageIndex);
            PrintTextLocatins2 printer = new PrintTextLocatins2();
            PDStream contents = page.getContents();
            if (contents != null) {
                printer.processStream(page, page.findResources(), contents.getStream());
            }
            List<TextPosition> glyphs = printer.getList();
            if (glyphs == null || glyphs.isEmpty()) {
                continue;
            }

            List<TextPosition> startHits = geTextPosition(glyphs, pdfInfoRegular.getstr_TS());
            List<TextPosition> endHits = geTextPosition(glyphs, pdfInfoRegular.getstr_End());
            List<TextPosition> titleHits = geTextPosition(glyphs, pdfInfoRegular.getKeyword());
            if (startHits.isEmpty() || endHits.isEmpty() || titleHits.isEmpty()) {
                continue;
            }

            // Hits come in (first glyph, last glyph) pairs, so index 1 is the
            // last glyph of the first occurrence.
            position = new ArrayList<float[]>();
            position.add(toCoordinate(startHits.get(1)));
            position.add(toCoordinate(endHits.get(1)));
            position.add(toCoordinate(titleHits.get(1)));
            position.add(new float[] {pageIndex});
            // No break on purpose: a later qualifying page replaces this result.
        }
    } catch (Exception e) {
        // Best effort: report the problem and return whatever was found so far.
        e.printStackTrace();
    } finally {
        if (document != null) {
            try {
                document.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
    return position;
}

/** Packs a glyph's x, y and getHeightDir() values into a three-element array. */
private static float[] toCoordinate(TextPosition glyph) {
    return new float[] {glyph.getX(), glyph.getY(), glyph.getHeightDir()};
}

/**
 * Searches a glyph sequence for every occurrence of a keyword.
 *
 * <p>An occurrence is a run of consecutive glyphs whose concatenated text
 * equals {@code str}. Occurrences may overlap.
 *
 * @param list glyphs of one page, in reading order
 * @param str  keyword to look for
 * @return for every occurrence, two entries: the glyph where it starts and
 *         the glyph where it ends; empty (never {@code null}) when nothing
 *         matches or when {@code list} is {@code null} or {@code str} is
 *         {@code null} or empty
 */
public static List<TextPosition> geTextPosition(List<TextPosition> list, String str) {
    List<TextPosition> matches = new ArrayList<TextPosition>();
    if (list == null || str == null || str.isEmpty()) {
        return matches;
    }
    int wanted = str.length();
    for (int start = 0; start < list.size(); start++) {
        String first = list.get(start).getCharacter();
        // A run can only equal the keyword if its first glyph is a prefix of it.
        if (first == null || first.isEmpty() || !str.startsWith(first)) {
            continue;
        }
        StringBuilder run = new StringBuilder(wanted);
        int next = start;
        // Consume glyphs until enough text is collected; a glyph may carry
        // more than one character, so the end index is tracked explicitly.
        while (next < list.size() && run.length() < wanted) {
            String piece = list.get(next).getCharacter();
            if (piece != null) {
                run.append(piece);
            }
            next++;
        }
        if (str.equals(run.toString())) {
            matches.add(list.get(start));
            matches.add(list.get(next - 1));
        }
    }
    return matches;
}

/**
 * Collects table coordinate information for a list of table descriptions.
 *
 * @param path path of the PDF file
 * @param list table descriptions to look up, one result entry per element
 * @return one entry per successfully processed description, each being the
 *         result of {@link #getTableLoactions(PdfInfoRegular, String)}
 *         (which may be {@code null} when that table was not found)
 * @throws IOException declared for callers; lookup failures of individual
 *                     tables are printed and skipped instead
 */
public List<List<float[]>> readTableCoor(String path, List<PdfInfoRegular> list) throws IOException {
    List<List<float[]>> coors = new ArrayList<List<float[]>>();
    for (PdfInfoRegular regular : list) {
        try {
            coors.add(getTableLoactions(regular, path));
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
    return coors;
}
} // closes the enclosing class (its declaration is not part of this excerpt)
