package com.freedville.nlprules; import java.io.File; import java.io.FileInputStream; import java.util.ArrayList; import java.util.Arrays; import java.util.List; import org.apache.commons.io.IOUtils; public class ProgrammingLanguageRulesBasedAnnotator { public void readFile(String filename) throws Exception { List sentences = getSentences(filename); for(String sentence : sentences) { //Start of rule. In any sentence with the word language... if(sentence.toLowerCase().contains("language")) { List languages = new ArrayList<>(); String[] words = getWords(sentence); //Any capitalized word (except the first) is a language for(int i=1; i 0 && Character.isUpperCase(words[i].charAt(0))) { languages.add(words[i]); } } //Print extracted results with context. if(!languages.isEmpty()) { System.out.println(sentence.trim() + "\n\t" + languages + "\n"); } } } } private List getSentences(String filename) throws Exception { List sentences = new ArrayList(); List lines = IOUtils.readLines(new FileInputStream(new File(filename))); for(String line : lines) { //Note: Overly simplistic sentence detection. sentences.addAll(Arrays.asList(line.split("\\."))); } return sentences; } private String[] getWords(String sentence) { //Note: overly simplistic tokenization. return sentence.trim().split("[ ,:()]"); } public static void main(String[] args) throws Exception { if(args.length == 0) { throw new IllegalArgumentException("Please pass a file name!"); } new ProgrammingLanguageRulesBasedAnnotator().readFile(args[0]); } }