package egor.robot;

import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.Reader;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.util.HashSet;
import java.util.Iterator;
import java.util.logging.Level;
import java.util.logging.Logger;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.cli.ParseException;
import org.apache.commons.cli.PosixParser;
import org.apache.http.Header;
import org.apache.xerces.xni.parser.XMLDocumentFilter;
import org.apache.xerces.xni.parser.XMLInputSource;
import org.cyberneko.html.HTMLConfiguration;

/* loaded from: input_file:egor/robot/HtmlParser.class */
/**
 * Extracts links from HTML documents.
 *
 * <p>{@link #extract} parses a single document and returns the links found in it as
 * absolute URI strings. {@link #main} is a command-line driver that reads records from a
 * crawler dump file and prints every extracted link to standard output.
 */
public class HtmlParser {
    private static final Logger LOG = Logger.getLogger(HtmlParser.class.getName());

    /** Charset used to decode a body when the response headers do not name a usable one. */
    private static final String DEFAULT_CHARSET = "UTF-8";

    /** Prefix of the charset parameter inside a Content-Type header value. */
    private static final String CHARSET_PARAM = "charset=";

    /**
     * Parses an HTML document and returns the links collected by a {@code LinkExtractor},
     * each resolved against the document URI (or against the base URI reported by the
     * extractor, when there is one).
     *
     * <p>Links that cannot be resolved are logged at INFO level and skipped.
     *
     * @param str    URI of the document; used as the parser's system id and as the base for
     *               resolving links
     * @param reader character stream with the document's HTML
     * @param z      first flag forwarded to {@code LinkExtractor}; {@link #main} passes
     *               whether {@code --extract-ahrefs} was given
     * @param z2     second flag forwarded to {@code LinkExtractor}; {@link #main} passes
     *               whether {@code --extract-imgsrc} was given
     * @return the set of resolved links as strings; never {@code null}
     * @throws IOException        if reading or parsing the document fails
     * @throws URISyntaxException if {@code str} is not a valid URI
     */
    public static HashSet<String> extract(String str, Reader reader, boolean z, boolean z2) throws IOException, URISyntaxException {
        HTMLConfiguration parser = new HTMLConfiguration();
        // Keep the concrete type: getBaseUri() and items() are not part of XMLDocumentFilter.
        LinkExtractor linkExtractor = new LinkExtractor(z, z2);
        parser.setProperty("http://cyberneko.org/html/properties/filters", new XMLDocumentFilter[]{new Simplifier(), linkExtractor});
        // The input is already a decoded Reader, so (going by the feature name) any charset
        // the document itself specifies is to be ignored by the scanner.
        parser.setFeature("http://cyberneko.org/html/features/scanner/ignore-specified-charset", true);
        parser.parse(new XMLInputSource((String) null, str, (String) null, reader, (String) null));

        URI base = new URI(str);
        if (linkExtractor.getBaseUri() != null) {
            try {
                base = base.resolve(linkExtractor.getBaseUri());
            } catch (Exception e) {
                // Best effort: keep resolving against the document URI itself.
                LOG.log(Level.INFO, "cannot resolve {0} with base {1}", new Object[]{str, linkExtractor.getBaseUri()});
            }
        }

        HashSet<String> links = new HashSet<>();
        for (String href : linkExtractor.items()) {
            try {
                links.add(base.resolve(href).toString());
            } catch (Exception e) {
                // Report the href that failed (not the base URI) and carry on with the rest.
                LOG.log(Level.INFO, "cannot resolve {0} with href {1}", new Object[]{str, href});
            }
        }
        return links;
    }

    /**
     * Command-line entry point.
     *
     * <p>Reads records from the dump file named by {@code -i/--data-file}. For each record
     * it prints {@code "<status> <url>"} to standard error and, when the record has a body,
     * prints {@code "0 <link>"} to standard output for every link returned by
     * {@link #extract}. Processing stops when the dump signals end of file.
     *
     * @param strArr command-line arguments
     * @throws IOException        if reading the dump or parsing a document fails
     * @throws URISyntaxException if a record's URL is not a valid URI
     */
    public static void main(String[] strArr) throws IOException, URISyntaxException {
        Options options = new Options();
        Option helpOption = new Option("help", "print this message");

        // OptionBuilder accumulates settings in static state until create() is called,
        // so each group of calls below describes exactly one option.
        OptionBuilder.isRequired();
        OptionBuilder.withLongOpt("data-file");
        OptionBuilder.withArgName("FILENAME");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("input from the crawler");
        Option dataFileOption = OptionBuilder.create("i");

        OptionBuilder.withLongOpt("extract-ahrefs");
        OptionBuilder.withDescription("extract A-HREFs");
        Option ahrefsOption = OptionBuilder.create("a");

        OptionBuilder.withLongOpt("extract-imgsrc");
        OptionBuilder.withDescription("extract IMG-SRCs");
        Option imgsrcOption = OptionBuilder.create("m");

        options.addOption(helpOption);
        options.addOption(dataFileOption);
        options.addOption(ahrefsOption);
        options.addOption(imgsrcOption);

        CommandLine commandLine;
        try {
            commandLine = new PosixParser().parse(options, strArr);
        } catch (ParseException e) {
            help(options);
            return;
        }
        if (commandLine.hasOption(helpOption.getOpt())) {
            help(options);
            return;
        }

        String dataFile = commandLine.getOptionValue(dataFileOption.getOpt());
        boolean extractAhrefs = commandLine.hasOption(ahrefsOption.getOpt());
        boolean extractImgsrc = commandLine.hasOption(imgsrcOption.getOpt());

        try (DumpInputStream dump = new DumpInputStream(dataFile)) {
            // The loop has no exit condition of its own: it ends when a read throws EOFException.
            while (true) {
                String url = dump.readUrl();
                System.err.printf("%2$d %1$s%n", url, Integer.valueOf(dump.readStatus()));

                String charset = DEFAULT_CHARSET;
                for (Header header : dump.readHeaders()) {
                    if ("Content-Type".equalsIgnoreCase(header.getName())) {
                        charset = charsetFromContentType(header.getValue(), charset);
                    }
                }

                byte[] body = dump.readData();
                if (body == null) {
                    continue;
                }
                Reader content = new InputStreamReader(new ByteArrayInputStream(body), charset);
                for (String link : extract(url, content, extractAhrefs, extractImgsrc)) {
                    System.out.printf("0 %s%n", link);
                }
            }
        } catch (EOFException ignored) {
            // End of the dump file: this is the normal way the read loop terminates.
        }
    }

    /**
     * Picks the charset named by a Content-Type header value.
     *
     * <p>The value is split on {@code ';'} and every {@code charset=} parameter (matched
     * case-insensitively) is considered in order; the last one that names a charset
     * supported by this JVM wins. Surrounding single or double quotes are removed.
     * Unsupported or malformed names are logged and ignored so that one bad header cannot
     * abort the whole run.
     *
     * @param contentType the raw header value; may be {@code null}
     * @param current     the charset to return when the header names no usable charset
     * @return a charset name accepted by this JVM, or {@code current}
     */
    private static String charsetFromContentType(String contentType, String current) {
        if (contentType == null) {
            return current;
        }
        String result = current;
        for (String part : contentType.split(";")) {
            String param = part.trim();
            // regionMatches(ignoreCase) avoids the default-locale pitfalls of toLowerCase().
            if (!param.regionMatches(true, 0, CHARSET_PARAM, 0, CHARSET_PARAM.length())) {
                continue;
            }
            String name = stripQuotes(param.substring(CHARSET_PARAM.length()).trim());
            if (isSupportedCharset(name)) {
                result = name;
            } else {
                LOG.log(Level.INFO, "unsupported charset {0}, using {1}", new Object[]{name, result});
            }
        }
        return result;
    }

    /** Removes one pair of matching single or double quotes around {@code value}, if present. */
    private static String stripQuotes(String value) {
        if (value.length() >= 2) {
            char first = value.charAt(0);
            char last = value.charAt(value.length() - 1);
            if (first == last && (first == '"' || first == '\'')) {
                return value.substring(1, value.length() - 1).trim();
            }
        }
        return value;
    }

    /** Returns whether {@code name} is a legal charset name that this JVM can decode. */
    private static boolean isSupportedCharset(String name) {
        try {
            return Charset.isSupported(name);
        } catch (IllegalArgumentException e) {
            // Thrown (as IllegalCharsetNameException) for syntactically illegal names.
            return false;
        }
    }

    /** Prints the usage message for the given options. */
    private static void help(Options options) {
        new HelpFormatter().printHelp(HtmlParser.class.getName(), options, true);
    }
}
