Skip to content

Commit

Permalink
improve performance, logs, allow to scrap specific fonds
Browse files Browse the repository at this point in the history
  • Loading branch information
don-vip committed Jan 27, 2018
1 parent 2e4cd91 commit b3da78d
Show file tree
Hide file tree
Showing 4 changed files with 50 additions and 35 deletions.
61 changes: 36 additions & 25 deletions src/main/java/com/github/donvip/archscrap/ArchScrap.java
Original file line number Diff line number Diff line change
Expand Up @@ -85,16 +85,22 @@ public static void main(String[] args) {
}

public void doScrap(String[] args) throws IOException {
LOGGER.info("Fetching all image fonds from archives website...");
Element plan = fetch("web_fondsmcadre/34/ILUMP458").select("#planclassement").first();
if (plan != null) {
Elements allFonds = plan.select("p > a");
LOGGER.info("Found {} fonds", allFonds.size());
for (Element fonds : allFonds) {
handleFonds(fonds);
if (args.length <= 1) {
LOGGER.info("Fetching all image fonds from archives website...");
Element plan = fetch("web_fondsmcadre/34/ILUMP458").select("#planclassement").first();
if (plan != null) {
Elements allFonds = plan.select("p > a");
LOGGER.info("Found {} fonds", allFonds.size());
for (Element fonds : allFonds) {
handleFonds(fonds);
}
} else {
LOGGER.error("Unable to fetch image fonds from archives website");
}
} else {
LOGGER.error("Unable to fetch image fonds from archives website");
for (String cote : args[1].split(",")) {
scrapFonds(cote);
}
}
}

Expand All @@ -114,35 +120,40 @@ private void handleFonds(Element fonds) throws IOException {
String fondsText = fonds.text();
Matcher m = Pattern.compile("(\\d+[A-Z][a-z]+) - (.+)").matcher(fondsText);
if (m.matches()) {
String cote = m.group(1);
Fonds f = searchFonds(cote);
if (f.getNotices().size() < f.getExpectedNotices()) {
// We have less notices in database than expected
// 1. Try to fetch missing notices
for (int i : f.getMissingNotices(session)) {
searchNotice(f, i);
}
// 2. Try to search new ones
int last = f.getNotices().isEmpty() ? 0 : f.getNotices().get(f.getNotices().size() - 1).getId();
for (int i = last + 1; i < f.getExpectedNotices(); i++) {
searchNotice(f, i);
}
}
scrapFonds(m.group(1));
} else {
LOGGER.warn("Unable to parse fonds {}", fondsText);
}
}

/**
 * Scraps the notices of the fonds identified by the given cote.
 * <p>
 * The fonds is looked up through {@link #searchFonds(String)}. When it holds fewer notices
 * than {@code getExpectedNotices()} announces, the notices reported missing are fetched first,
 * then every identifier following the last known notice, up to and including the expected count.
 *
 * @param cote the fonds identifier, either parsed from the classification plan
 *             or given on the command line
 * @throws IOException if fetching the fonds or one of its notices fails
 */
private void scrapFonds(String cote) throws IOException {
    Fonds f = searchFonds(cote);
    if (f == null) {
        // searchFonds() hands back null when the fonds is not in database and could not be
        // created either (it guards against a null createNewFonds() result). This is easy to
        // trigger with a mistyped cote on the command line, so skip instead of crashing.
        LOGGER.warn("Unable to find fonds {}", cote);
        return;
    }
    if (f.getNotices().size() < f.getExpectedNotices()) {
        // We have fewer notices in database than expected
        // 1. Try to fetch missing notices
        for (int i : f.getMissingNotices(session)) {
            searchNotice(f, i);
        }
        // 2. Try to search new ones, starting right after the last notice we know about
        int last = f.getNotices().isEmpty() ? 0 : f.getNotices().get(f.getNotices().size() - 1).getId();
        for (int i = last + 1; i <= f.getExpectedNotices(); i++) {
            searchNotice(f, i);
        }
    }
}

private Fonds searchFonds(String cote) throws IOException {
// Check to be sure, we don't have it in database
session.beginTransaction();
Fonds f = session.get(Fonds.class, cote);
session.getTransaction().commit();
if (f == null) {
f = createNewFonds(cote);
session.beginTransaction();
session.save(f);
session.getTransaction().commit();
if (f != null) {
session.beginTransaction();
session.save(f);
session.getTransaction().commit();
}
}
return f;
}
Expand Down
12 changes: 8 additions & 4 deletions src/main/java/com/github/donvip/archscrap/Parser.java
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,10 @@
public class Parser {

private static final Logger LOGGER = LogManager.getLogger();

static {
java.util.logging.Logger.getLogger("HeidelTimeStandalone").setLevel(java.util.logging.Level.WARNING);
}

// -- HeidelTime
private static final HeidelTimeStandalone timeNarrative = new HeidelTimeStandalone(
Expand Down Expand Up @@ -189,6 +193,9 @@ static void extractDate(String text, final Notice n) {
// http://www.timeml.org/publications/timeMLdocs/timeml_1.2.1.html#timex3
Timex3 t = (Timex3) iterTimex.next();
String v = t.getTimexValue();
if (v.startsWith("XXXX-")) {
continue;
}
switch (t.getTimexType()) {
case "DATE":
switch (v.length()) {
Expand All @@ -215,12 +222,9 @@ static void extractDate(String text, final Notice n) {
throw new UnsupportedOperationException(v);
}
case "DURATION":
continue;
case "TIME":
if (v.startsWith("XXXX-XX-XXT")) {
continue;
}
case "SET":
continue;
default:
throw new UnsupportedOperationException(t.getTimexType()+" / "+t.getTimexValue());
}
Expand Down
8 changes: 4 additions & 4 deletions src/main/resources/config.windows.props
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@
considerDate = true

# Duration
considerDuration = true
considerDuration = false

# Set
considerSet = true
considerSet = false

# Time
considerTime = true
considerTime = false

# Temponyms (make sure you know what you are doing if you set this to "true")
considerTemponym = false
Expand All @@ -21,7 +21,7 @@ considerTemponym = false
# Path to TreeTagger home directory
###################################
# Ensure there is no white space in path (try to escape white spaces)
treeTaggerHome = C:\\Users\\vippy\\eclipse-workspace\\ArchScrap\\TreeTagger
treeTaggerHome = C:\\GIT\\ArchScrap\\TreeTagger
# This one is only necessary if you want to process chinese documents.
chineseTokenizerPath = SET ME IN CONFIG.PROPS! (e.g., /home/jannik/treetagger/chinese-tokenizer)

Expand Down
4 changes: 2 additions & 2 deletions src/main/resources/log4j2.xml
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,14 @@
<Console name="Console" target="SYSTEM_OUT">
<PatternLayout pattern="%d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n" />
</Console>
<File name="MyFile" fileName="all.log" immediateFlush="false" append="false">
<File name="fileAppender" fileName="all.log" immediateFlush="false" append="false">
<PatternLayout pattern="%d{yyy-MM-dd HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n"/>
</File>
</Appenders>
<Loggers>
<Root level="info">
<AppenderRef ref="Console" />
<AppenderRef ref="MyFile"/>
<AppenderRef ref="fileAppender"/>
</Root>
</Loggers>
</Configuration>

0 comments on commit b3da78d

Please sign in to comment.