package de.uniol.inf.is.odysseus.wrapper.webcrawler.util;

import edu.uci.ics.crawler4j.crawler.Page;
import edu.uci.ics.crawler4j.parser.HtmlParseData;
import edu.uci.ics.crawler4j.url.WebURL;
import java.util.ArrayList;
import java.util.List;
import java.util.Locale;
import java.util.regex.Pattern;

/* loaded from: input_file:de/uniol/inf/is/odysseus/wrapper/webcrawler/util/WebCrawler.class */
/**
 * Crawler that skips URLs ending in one of the file extensions listed in {@link #FILTERS}
 * and stores the extracted text of every HTML page it visits.
 *
 * <p>The collected text lives in a {@code static} list, so it is shared by every instance
 * of this class and survives until {@link #deleteText()} is called.
 *
 * <p>NOTE(review): the shared list is a plain, unsynchronized {@link ArrayList}. If the
 * crawl controller runs more than one crawler instance concurrently, access to it would
 * need external synchronization — confirm how this class is started.
 */
public class WebCrawler extends edu.uci.ics.crawler4j.crawler.WebCrawler {
    /** Matches lower-cased URLs that end in a stylesheet, script, image, media, PDF or archive extension. */
    private static final Pattern FILTERS = Pattern.compile(".*(\\.(css|js|bmp|gif|jpe?g|png|tiff?|mid|mp2|mp3|mp4|wav|avi|mov|mpeg|ram|m4v|pdf|rm|smil|wmv|swf|wma|zip|rar|gz))$");

    /** Text of every HTML page visited so far, in visiting order; shared by all instances. */
    private static final ArrayList<String> savedText = new ArrayList<>();

    /** Parse data of the most recently visited HTML page; {@code null} until one has been visited. */
    HtmlParseData htmlParseData;

    /**
     * Decides whether the given URL should be crawled and prints its details to standard output.
     *
     * <p>A URL is accepted unless its lower-cased form matches {@link #FILTERS}. Lower-casing
     * uses {@link Locale#ROOT} so that the extension check does not depend on the JVM's default
     * locale (e.g. Turkish, where {@code "GIF"} would not lower-case to {@code "gif"}).
     *
     * <p>An earlier version additionally required the lower-cased URL to start with the
     * original URL, which merely rejected every URL containing an upper-case character;
     * that check has been dropped.
     *
     * @param webURL the candidate URL
     * @return {@code true} if the URL does not end in a filtered extension
     */
    @Override
    public boolean shouldVisit(WebURL webURL) {
        String url = webURL.getURL();
        System.out.println("--------ANFANG---------------");
        System.out.println("URL:" + url);
        System.out.println("Depth:" + webURL.getDepth());
        System.out.println("Domain:" + webURL.getDomain());
        System.out.println("ParentURL:" + webURL.getParentUrl());
        System.out.println("Path" + webURL.getPath());
        System.out.println("--------ENDE---------------");
        String lowerCase = url.toLowerCase(Locale.ROOT);
        return !FILTERS.matcher(lowerCase).matches();
    }

    /**
     * Processes a fetched page. When the page carries HTML parse data, that data is remembered
     * in {@link #htmlParseData}, its text is appended to the shared text list, and the text
     * length, HTML length and number of outgoing links are printed to standard output.
     * Pages with any other kind of parse data are ignored.
     *
     * @param page the fetched page
     */
    @Override
    public void visit(Page page) {
        Object parseData = page.getParseData();
        if (parseData instanceof HtmlParseData) {
            this.htmlParseData = (HtmlParseData) parseData;
            String text = this.htmlParseData.getText();
            savedText.add(text);
            String html = this.htmlParseData.getHtml();
            List<WebURL> outgoingUrls = this.htmlParseData.getOutgoingUrls();
            System.out.println("Text length: " + text.length());
            System.out.println("Html length: " + html.length());
            System.out.println("Number of outgoing links: " + outgoingUrls.size());
        }
    }

    /**
     * Returns the parse data of the most recently visited HTML page.
     *
     * @return the last HTML parse data seen by this instance, or {@code null} if none yet
     */
    public HtmlParseData getParseData() {
        return this.htmlParseData;
    }

    /**
     * Returns the shared list of collected page texts.
     *
     * @return the live backing list itself (not a copy); later visits and
     *         {@link #deleteText()} are reflected in it
     */
    public ArrayList<String> getText() {
        return savedText;
    }

    /** Removes all collected page texts from the shared list. */
    public void deleteText() {
        savedText.clear();
    }

    /**
     * Returns the crawler-local data; simply delegates to the superclass implementation.
     *
     * @return whatever the superclass reports as local data
     */
    @Override
    public Object getMyLocalData() {
        return super.getMyLocalData();
    }
}
