package com.lucidworks.hadoop.tika;

import com.google.common.base.Strings;
import com.lucidworks.hadoop.io.LWDocument;
import com.lucidworks.hadoop.io.LWDocumentProvider;
import com.lucidworks.hadoop.process.TikaProcess;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.util.AbstractMap;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.UUID;
import org.apache.tika.exception.TikaException;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaMetadataKeys;
import org.apache.tika.parser.AutoDetectParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.parser.Parser;
import org.apache.tika.parser.ParserDecorator;
import org.apache.tika.sax.BodyContentHandler;
import org.apache.tika.sax.LinkContentHandler;
import org.apache.tika.sax.TeeContentHandler;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;

/* loaded from: input_file:com/lucidworks/hadoop/tika/TikaParsing.class */
public class TikaParsing implements TikaProcess {
    // Configuration key names. None of these keys is read inside this class;
    // presumably other components look them up in the job configuration — verify against callers.
    public static final String TIKA_INCLUDE_IMAGES = "default.include.images";
    // NOTE(review): both the constant name ("FLATENN") and the key text ("faltten") are misspelt.
    // They are left untouched because external configuration and callers may depend on the exact spelling.
    public static final String TIKA_FLATENN_COMPOUND = "default.faltten.compound";
    public static final String TIKA_ADD_FAILED_DOCS = "default.add.failed.docs";
    public static final String TIKA_ADD_ORIGINAL_CONTENT = "default.add.original.content";
    public static final String FIELD_MAPPING_RENAME_UNKNOWN = "default.rename.unknown";

    // Name of the document field whose first value is inspected for raw bytes by tikaParsing().
    private static final String RAW_CONTENT = "_raw_content_";

    // Shared logger; also used by the nested RecursiveMetadataParser.
    private static final Logger log = LoggerFactory.getLogger(TikaParsing.class);

    // Mutable global switches. They are not consulted anywhere in this class;
    // presumably they are set and read by other components — TODO confirm.
    public static boolean includeImages = false;
    public static boolean flattenCompound = false;
    public static boolean addFailedDocs = false;
    public static boolean addOriginalContent = false;
    public static boolean renameUnknown = false;

    // Upper bound applied by generateDocument() to the "body" text, measured in Java chars
    // (String.length()), not in encoded bytes.
    public static int MAX_TERM_LENGTH_UTF = 32766;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:com/lucidworks/hadoop/tika/TikaParsing$RecursiveMetadataParser.class */
    /**
     * Parser decorator that records the metadata and extracted body text of every
     * document it is asked to parse.
     *
     * <p>Each call to {@link #parse} appends one (metadata, body handler) pair to an
     * internal list, which is exposed through {@link #getNestedDocuments()}.
     */
    public static class RecursiveMetadataParser extends ParserDecorator {
        /** One entry per completed {@code parse} call, in completion order. */
        private final List<Map.Entry<Metadata, ContentHandler>> nestedFiles = new ArrayList<>();

        /**
         * @param parser the parser that performs the actual parsing work
         */
        public RecursiveMetadataParser(Parser parser) {
            super(parser);
        }

        /**
         * Returns the live list of collected (metadata, body handler) pairs.
         *
         * @return the internal list itself, not a copy
         */
        public List<Map.Entry<Metadata, ContentHandler>> getNestedDocuments() {
            return this.nestedFiles;
        }

        /**
         * Delegates to the wrapped parser using a fresh {@link BodyContentHandler} and,
         * once the delegate returns normally, records the metadata together with that handler.
         *
         * <p>The {@code contentHandler} argument is not used: events are sent to the
         * internally created body handler instead. If the delegate throws, nothing is
         * recorded for this call.
         *
         * @param inputStream    stream to parse
         * @param contentHandler ignored by this implementation
         * @param metadata       metadata object handed to the delegate and stored afterwards
         * @param parseContext   context handed to the delegate
         * @throws IOException   propagated from the delegate
         * @throws SAXException  propagated from the delegate
         * @throws TikaException propagated from the delegate
         */
        @Override // org.apache.tika.parser.ParserDecorator, org.apache.tika.parser.Parser
        public void parse(InputStream inputStream, ContentHandler contentHandler, Metadata metadata, ParseContext parseContext) throws IOException, SAXException, TikaException {
            BodyContentHandler bodyContentHandler = new BodyContentHandler();
            super.parse(inputStream, bodyContentHandler, metadata, parseContext);
            // Skip building the per-field dump entirely when debug output is disabled.
            if (TikaParsing.log.isDebugEnabled()) {
                TikaParsing.log.debug("Begin Document");
                TikaParsing.log.debug("Metadata: ");
                for (String name : metadata.names()) {
                    TikaParsing.log.debug("\t{} -> {}", name, Arrays.asList(metadata.getValues(name)));
                }
                TikaParsing.log.debug("End Document");
            }
            this.nestedFiles.add(new AbstractMap.SimpleEntry<Metadata, ContentHandler>(metadata, bodyContentHandler));
        }
    }

    /**
     * Parses raw bytes with Tika and converts every (metadata, body) pair collected by the
     * {@link RecursiveMetadataParser} into an {@link LWDocument}.
     *
     * <p>{@link SAXException}, {@link TikaException} and {@link IOException} raised while
     * parsing are logged as warnings and do not abort the call; any pairs the parser had
     * already collected are still converted. Other throwables propagate to the caller.
     *
     * @param lWDocument source document; only its id is read here
     * @param bArr       raw content to parse
     * @return one document per collected pair, possibly an empty array
     */
    private static LWDocument[] parseLWSolrDocument(LWDocument lWDocument, byte[] bArr) {
        RecursiveMetadataParser recursiveParser = new RecursiveMetadataParser(new AutoDetectParser());
        ParseContext parseContext = new ParseContext();
        // Register the decorator under Parser.class so parsers that consult the context for
        // a nested-document parser get this recording decorator.
        parseContext.set(Parser.class, recursiveParser);
        // NOTE: RecursiveMetadataParser.parse (as defined in this file) substitutes its own
        // BodyContentHandler, so this handler is passed along but receives no events from it.
        ContentHandler handler = new TeeContentHandler(new LinkContentHandler(), new BodyContentHandler());

        try (InputStream input = new ByteArrayInputStream(bArr)) {
            recursiveParser.parse(input, handler, new Metadata(), parseContext);
        } catch (SAXException e) {
            log.warn("SAXException " + lWDocument.getId(), e);
        } catch (TikaException e) {
            log.warn("TikaException " + lWDocument.getId(), e);
        } catch (IOException e) {
            log.warn("IOException " + lWDocument.getId(), e);
        }

        List<LWDocument> documents = new ArrayList<>();
        String parentId = lWDocument.getId();
        for (Map.Entry<Metadata, ContentHandler> nested : recursiveParser.getNestedDocuments()) {
            Metadata nestedMetadata = nested.getKey();
            LWDocument document = generateDocument(LWDocumentProvider.createDocument(), nestedMetadata, nested.getValue());
            document.setId(nestedDocumentId(parentId, nestedMetadata));
            documents.add(document);
        }
        return documents.toArray(new LWDocument[documents.size()]);
    }

    /**
     * Chooses the id for a document produced from one parsed entry.
     *
     * <p>Without a parent id a random stand-alone id is generated. Otherwise the id is
     * {@code parentId + "#" + suffix}, where the suffix is the resource name, else the
     * {@code title} metadata value, else a random UUID.
     */
    private static String nestedDocumentId(String parentId, Metadata metadata) {
        if (Strings.isNullOrEmpty(parentId)) {
            log.warn("Can not set id for nested document, [{}]", parentId);
            return "directory_ingest_mapper_id_" + UUID.randomUUID();
        }
        String resourceName = metadata.get(TikaMetadataKeys.RESOURCE_NAME_KEY);
        if (!Strings.isNullOrEmpty(resourceName)) {
            return parentId + "#" + resourceName;
        }
        String title = metadata.get("title");
        if (!Strings.isNullOrEmpty(title)) {
            return parentId + "#" + title;
        }
        log.warn("No resource name or title were found for document: [{}]", parentId);
        return parentId + "#" + UUID.randomUUID();
    }

    /**
     * Copies Tika output into the given document.
     *
     * <p>For every metadata name, the value returned by {@code metadata.get(name)} is added as
     * a field of that name. The handler's {@code toString()} text is added as the
     * {@code "body"} field, cut to at most {@link #MAX_TERM_LENGTH_UTF} chars.
     *
     * @param lWDocument     document to populate; it is modified in place
     * @param metadata       parsed metadata, or {@code null} to add no metadata fields
     * @param contentHandler handler holding the extracted text, or {@code null} to add no body
     * @return the same {@code lWDocument} instance that was passed in
     */
    public static LWDocument generateDocument(LWDocument lWDocument, Metadata metadata, ContentHandler contentHandler) {
        if (metadata != null) {
            for (String name : metadata.names()) {
                // NOTE(review): Metadata.get(name) yields a single value; names that carry
                // several values presumably lose all but one here — confirm whether intended.
                lWDocument.addField(name, metadata.get(name));
            }
        }
        if (contentHandler != null) {
            lWDocument.addField("body", truncateBody(contentHandler.toString(), MAX_TERM_LENGTH_UTF));
        }
        return lWDocument;
    }

    /**
     * Returns {@code text} limited to {@code maxChars} chars without splitting a surrogate
     * pair: when the cut would fall between a high and a low surrogate, the dangling high
     * surrogate is dropped as well so the result stays well-formed UTF-16.
     */
    private static String truncateBody(String text, int maxChars) {
        if (text.length() <= maxChars) {
            return text;
        }
        int end = maxChars;
        // text.length() > maxChars here, so charAt(end) is always in range.
        if (end > 0 && Character.isHighSurrogate(text.charAt(end - 1)) && Character.isLowSurrogate(text.charAt(end))) {
            end--;
        }
        return text.substring(0, end);
    }

    /**
     * Entry point of the Tika step for a single document.
     *
     * <p>When the first value of the raw-content field is a {@code byte[]}, those bytes are
     * parsed and the resulting documents are returned. Otherwise the raw-content field is
     * removed and the incoming document is returned unchanged as the only array element.
     *
     * @param lWDocument document to process
     * @return the parsed documents, or a one-element array holding {@code lWDocument}
     */
    @Override // com.lucidworks.hadoop.process.TikaProcess
    public LWDocument[] tikaParsing(LWDocument lWDocument) {
        final Object rawContent = lWDocument.getFirstFieldValue(RAW_CONTENT);
        if (!(rawContent instanceof byte[])) {
            // Nothing parseable: strip the field and pass the document through.
            lWDocument.removeField(RAW_CONTENT);
            return new LWDocument[]{lWDocument};
        }
        return parseLWSolrDocument(lWDocument, (byte[]) rawContent);
    }
}
