简单的TFIDF算法实现Java代码

    xiaoxiao2021-11-30  23

    1、代码可以读取某个目录下的全部文件进行计算,因此将需要计算的文件放到某个目录下,然后修改代码中的路径即可

    2、这份代码将文件目录写死,若想降低耦合程度,还可以将路径信息抽取出来,改造成读取xml文件的方式

    3、计算后可以得到每个关键词在每个文件中的TFIDF值,并将结果输出到与输入文件同目录下的result.txt中

    代码奉上:

    package com.tt.test; import java.io.BufferedReader; import java.io.BufferedWriter; import java.io.File; import java.io.FileReader; import java.io.FileWriter; import java.io.IOException; public class Tfidf { /** * 简单的TFIDF算法实现代码 * 输入:某个目录下的所有英文文档 * 输出:计算得到的TFIDF值 */ public static long TOTALDOC = 0; public static int KEYINDEX = 0; public static int FILEINDEX = 0; public static String FILENAMES[] = new String[100]; public static String KEYWORD[] = new String[10000]; public static double IDF[] = new double[10000]; public static double RESULT[][] = new double[100][10000]; private static void init(String path) { File file = new File(path); File[] tempList = file.listFiles(); for (int i = 0; i < tempList.length; i++) { if (tempList[i].isFile()) { TOTALDOC++; getKeyWords(tempList[i].getAbsolutePath()); } } System.out.println("文件数为" + TOTALDOC); } public static void getKeyWords(String fileName) { String thisfile = ""; File file = new File(fileName); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(file)); String tempString = null; int line = 1; while ((tempString = reader.readLine()) != null) { thisfile += tempString; } String temp[] = thisfile .split(" |、|,|。|;|?|!|,|\\.|;|\\?|!|\"|\\'|\\-|\\(|\\)|]"); for (int i = 0; i < KEYINDEX; i++) { int j; for (j = 0; j < temp.length; j++) { if (temp[j].equals(KEYWORD[i])) break; } if (j < temp.length) IDF[i]++; } boolean flag = false; for (String s : temp) { if (s == null || s.equals("")) continue; flag = false; for (int i = 0; i < KEYINDEX; i++) { if (s.equals(KEYWORD[i])) flag = true; } if ((!flag) && (s != null) && (!s.equals(""))) { // System.out.println("got: "+s); KEYWORD[KEYINDEX] = s; IDF[KEYINDEX] = 1; KEYINDEX++; } } // for(int i=0;i<KEYINDEX;i++){ // System.out.println(KEYWORD[i]+" "+IDF[i]); // } reader.close(); } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e1) { } } } } private static void getIDF() { for (int i = 0; i < KEYINDEX; i++) { IDF[i] = Math.log(TOTALDOC * 1.0 / IDF[i]); if (IDF[i] == 0) KEYWORD[i] = "***"; } } private static void getTF() { for (int i = 0; i < TOTALDOC; i++) { for (int j = 0; j < KEYINDEX; j++) { RESULT[i][j] = IDF[j] * RESULT[i][j]; RESULT[i][j] = (int) (RESULT[i][j] * 10000) / 10000.0; } } } private static void preTF(String path) { for (int i = 0; i < 100; i++) { for (int j = 0; j < 10000; j++) { RESULT[i][j] = 0; } } File file = new File(path); File[] tempList = file.listFiles(); for (int i = 0; i < tempList.length; i++) { if (tempList[i].isFile()) { getRate(tempList[i].getAbsolutePath()); FILENAMES[FILEINDEX] = tempList[i].getName(); FILEINDEX++; } } } private static void getRate(String fileName) { String thisfile = ""; File file = new File(fileName); BufferedReader reader = null; try { reader = new BufferedReader(new FileReader(file)); String tempString = null; int line = 1; while ((tempString = reader.readLine()) != null) { thisfile += tempString; } String temp[] = thisfile .split(" |、|,|。|;|?|!|,|\\.|;|\\?|!|\"|\\'|\\-|\\(|\\)|]"); double max = -1; for (int i = 0; i < KEYINDEX; i++) { if (KEYWORD[i].equals("***")) continue; for (String s : temp) { if (KEYWORD[i].equals(s)) { RESULT[FILEINDEX][i]++; } } if (RESULT[FILEINDEX][i] > max) max = RESULT[FILEINDEX][i]; } for (int i = 0; i < KEYINDEX; i++) { RESULT[FILEINDEX][i] = RESULT[FILEINDEX][i] * 1.0 / max; } } catch (IOException e) { e.printStackTrace(); } finally { if (reader != null) { try { reader.close(); } catch (IOException e1) { } } } } public static void main(String args[]) { init("D:/"); getIDF(); preTF("D:/"); getTF(); try { BufferedWriter out = new BufferedWriter(new FileWriter("D:/result.txt")); out.write(KEYINDEX); out.write("\t\t"); for (int i = 0; i < KEYINDEX; i++) { out.write(KEYWORD[i] + "\t"); } out.write("\n"); for (int i = 0; i < TOTALDOC; i++) { out.write(FILENAMES[i] + "\t"); for (int j = 0; j < KEYINDEX; j++) { out.write(RESULT[i][j] + "\t"); } out.write("\n"); } out.flush(); out.close(); } catch (IOException e) { e.printStackTrace(); } } }

    转载请注明原文地址: https://ju.6miu.com/read-679212.html

    最新回复(0)