package edu.cmu.lemurproject;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.URI;
import java.net.URISyntaxException;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/cmu/lemurproject/WarcHTMLResponseRecord.class */
/**
 * Wraps a {@code WarcRecord} whose header record type is {@code "response"} and extracts
 * the outgoing HTTP(S) links found in the HTML tags of its content.
 *
 * <p>Links are taken from {@code href} attributes matched by the anchor/area patterns and
 * {@code src} attributes matched by the frame/iframe patterns, resolved against the
 * record's {@code WARC-Target-URI}, and de-duplicated.
 */
public class WarcHTMLResponseRecord {
    /** Header record type a {@code WarcRecord} must carry to be accepted by this wrapper. */
    private static final String RESPONSE_RECORD_TYPE = "response";

    /** Captures the text between {@code <} and {@code >} of every tag on a line. */
    private static final Pattern ALL_HTML_TAGS = Pattern.compile("<(.*?)>");

    // The link patterns are applied with Matcher.find() to the inside of a tag, so they are
    // not anchored to the tag name: A_HREF_PATTERN matches any tag text that contains an
    // 'a'/'A' followed later by a quoted href attribute.
    private static final Pattern A_HREF_PATTERN = Pattern.compile("[aA].+?[hH][rR][eE][fF]=['\"](.+?)['\"].*?");
    private static final Pattern AREA_HREF_PATTERN = Pattern.compile("[aA][rR][eE][aA].+?[hH][rR][eE][fF]=['\"](.*?)['\"].*?");
    private static final Pattern FRAME_SRC_PATTERN = Pattern.compile("[fF][rR][aA][mM][eE].+?[sS][rR][cC]=['\"](.*?)['\"].*?");
    private static final Pattern IFRAME_SRC_PATTERN = Pattern.compile("[iI][fF][rR][aA][mM][eE].+?[sS][rR][cC]=['\"](.*?)['\"].*?");

    /** Accepts only absolute {@code http://} or {@code https://} URLs (scheme is case-insensitive). */
    private static final Pattern HTTP_START_PATTERN = Pattern.compile("^[hH][tT][tT][pP][sS]?://.*");

    /** Link patterns in the order they are tried against each tag. */
    private static final Pattern[] LINK_PATTERNS = {
        A_HREF_PATTERN, AREA_HREF_PATTERN, FRAME_SRC_PATTERN, IFRAME_SRC_PATTERN
    };

    /** Backing record; its contents are replaced via {@code set(...)}, the reference never changes. */
    private final WarcRecord warcRecord = new WarcRecord();

    /** Creates a wrapper around a freshly constructed {@code WarcRecord}. */
    public WarcHTMLResponseRecord() {
    }

    /**
     * Copy constructor: copies the other wrapper's backing record into this one.
     *
     * @param warcHTMLResponseRecord the wrapper to copy from
     */
    public WarcHTMLResponseRecord(WarcHTMLResponseRecord warcHTMLResponseRecord) {
        this.warcRecord.set(warcHTMLResponseRecord.warcRecord);
    }

    /**
     * Creates a wrapper from a raw record. The record is copied only when its header record
     * type equals {@code "response"} (ignoring case); otherwise the wrapper keeps a freshly
     * constructed {@code WarcRecord}.
     *
     * @param warcRecord the record to wrap
     */
    public WarcHTMLResponseRecord(WarcRecord warcRecord) {
        copyIfResponse(warcRecord);
    }

    /**
     * Replaces the backing record when the given record's header record type equals
     * {@code "response"} (ignoring case); any other record is ignored.
     *
     * @param warcRecord the record to wrap
     */
    public void setRecord(WarcRecord warcRecord) {
        copyIfResponse(warcRecord);
    }

    /** Copies {@code source} into the backing record when it is a response record. */
    private void copyIfResponse(WarcRecord source) {
        // Constant-first comparison so a record without a record type is ignored instead of
        // raising a NullPointerException.
        if (RESPONSE_RECORD_TYPE.equalsIgnoreCase(source.getHeaderRecordType())) {
            this.warcRecord.set(source);
        }
    }

    /** @return the backing record (the live instance, not a copy) */
    public WarcRecord getRawRecord() {
        return this.warcRecord;
    }

    /** @return the {@code WARC-Target-URI} header metadata item of the backing record */
    public String getTargetURI() {
        return this.warcRecord.getHeaderMetadataItem("WARC-Target-URI");
    }

    /** @return the {@code WARC-TREC-ID} header metadata item of the backing record */
    public String getTargetTrecID() {
        return this.warcRecord.getHeaderMetadataItem("WARC-TREC-ID");
    }

    /**
     * Resolves a link found in the page against the page's own URL.
     *
     * <p>Everything from the first {@code '?'} onwards is dropped from the link before
     * resolution (unless the {@code '?'} is the very first character).
     *
     * @param baseUrl the URL of the page containing the link
     * @param link    the raw attribute value extracted from a tag
     * @return the resolved URL; the query-stripped link itself when it cannot be parsed as a
     *         URI reference; or the empty string when {@code baseUrl} is not a valid URI or
     *         any other failure occurs (e.g. a {@code null} argument)
     */
    private String getNormalizedContentURL(String baseUrl, String link) {
        String stripped = link;
        try {
            URI base = new URI(baseUrl);
            int queryStart = link.indexOf("?");
            if (queryStart > 0) {
                stripped = link.substring(0, queryStart);
            }
            return base.resolve(stripped).toString();
        } catch (IllegalArgumentException e) {
            // URI.resolve(String) rejects a syntactically invalid link; hand it back
            // unresolved and let the caller's http(s) filter decide whether to keep it.
            return stripped;
        } catch (URISyntaxException | RuntimeException e) {
            // Invalid base URL, or an unexpected runtime failure such as a null argument.
            return "";
        }
    }

    /**
     * Extracts the distinct absolute HTTP(S) URLs referenced by the given tags.
     *
     * <p>For each tag the link patterns are tried in order; the search for that tag stops at
     * the first pattern that contributes a URL not already collected. URLs equal to
     * {@code baseUrl} are never collected.
     *
     * @param tags    tag contents (the text between {@code <} and {@code >})
     * @param baseUrl the URL of the page, used to resolve relative links
     * @return the set of resolved outlink URLs; iteration order is unspecified
     */
    private HashSet<String> getMatchesOutputSet(Vector<String> tags, String baseUrl) {
        HashSet<String> outlinks = new HashSet<>();
        for (String tag : tags) {
            for (Pattern linkPattern : LINK_PATTERNS) {
                Matcher linkMatcher = linkPattern.matcher(tag);
                if (!linkMatcher.find()) {
                    continue;
                }
                String url = getNormalizedContentURL(baseUrl, linkMatcher.group(1));
                // Set.add reports whether the URL was new, which doubles as the duplicate check.
                if (HTTP_START_PATTERN.matcher(url).matches()
                        && !baseUrl.equals(url)
                        && outlinks.add(url)) {
                    break;
                }
            }
        }
        return outlinks;
    }

    /**
     * Returns the distinct HTTP(S) URLs this record links to, excluding its own target URI.
     *
     * <p>The content is read line by line: everything up to and including the first blank
     * line is skipped, and the remaining lines are scanned for tags.
     *
     * @return the outlinks in unspecified order; empty when the record has no target URI,
     *         has no content, or the content cannot be read
     */
    public Vector<String> getURLOutlinks() {
        Vector<String> outlinks = new Vector<>();
        String targetURI = getTargetURI();
        if (targetURI == null || targetURI.length() == 0) {
            return outlinks;
        }
        byte[] content = this.warcRecord.getContent();
        if (content == null) {
            return outlinks;
        }

        Vector<String> tags = new Vector<>();
        // NOTE(review): bytes are decoded with the platform default charset, as before;
        // consider an explicit charset if results must be identical across machines.
        try (BufferedReader reader =
                new BufferedReader(new InputStreamReader(new ByteArrayInputStream(content)))) {
            String line;
            // Skip the leading section, which ends at the first blank line.
            do {
                line = reader.readLine();
            } while (line != null && line.trim().length() != 0);

            // Collect the inside of every tag from the remaining lines.
            while ((line = reader.readLine()) != null) {
                Matcher tagMatcher = ALL_HTML_TAGS.matcher(line);
                while (tagMatcher.find()) {
                    tags.add(tagMatcher.group(1));
                }
            }
        } catch (IOException e) {
            // Treat unreadable content as having no outlinks rather than returning a partial list.
            return outlinks;
        }

        for (String url : getMatchesOutputSet(tags, targetURI)) {
            if (!url.equals(targetURI)) {
                outlinks.add(url);
            }
        }
        return outlinks;
    }
}
