package com.mini.framework.third.resourse.article.block;

import java.io.IOException;
import java.util.Arrays;
import java.util.Optional;
import java.util.regex.Pattern;

import org.apache.commons.collections.CollectionUtils;
import org.apache.commons.lang3.StringUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.helper.HttpConnection;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.mini.framework.core.exception.HandleIOException;
import com.mini.framework.third.resourse.article.block.model.ArticleRawContent;
import com.mini.framework.util.asserts.AssertUtil;

/**
 * Parser for WeChat official-account (Weixin public platform) articles.
 * <p>
 * Example of a supported URL:
 * https://mp.weixin.qq.com/s/wTJPhhE4l6Lb_uyzuTwt7w
 *
 * @author jayheo
 */
public class WeixinPublicArticleAnalysis implements ResourseAnalysis{

	/** URLs handled by this parser: anything under {@code https://mp.weixin.qq.com/s/}. */
	private static final Pattern ARTICLE_URL_PATTERN = Pattern.compile("https://mp\\.weixin\\.qq\\.com/s/.*");

	/** Timeout, in milliseconds, passed to the jsoup connection when fetching the article. */
	private static final int FETCH_TIMEOUT_MILLIS = 100000;

	/** Tags inside {@code #js_content} that are turned into content entries. */
	private static final String CONTENT_ELEMENT_QUERY = "p,figure,pre,h4,h5,table";

	@Override
	public String analysisKey() {
		return "weixinPublicDocument";
	}

	/**
	 * Tells whether the given URL is a WeChat article URL this parser can handle.
	 *
	 * @param url the URL to test; checked with {@code AssertUtil.assertMethodRequire}
	 * @return {@code true} when the whole URL matches {@code https://mp.weixin.qq.com/s/.*}
	 */
	@Override
	public boolean urlResourseMatch(String url) {
		AssertUtil.assertMethodRequire(url, "url");
		return ARTICLE_URL_PATTERN.matcher(url).matches();
	}

	/**
	 * Downloads the article at {@code url} and converts it into an {@link ArticleRawContent}.
	 * <p>
	 * The id, account and title are always set on the result, even when the article body
	 * yields no content entries. For every matched body element, one {@code "img"} entry is
	 * created per contained {@code <img>} that has both a non-blank {@code data-src} and a
	 * non-blank {@code data-type}, followed by one entry (keyed by the element's tag name)
	 * holding the element's text when that text is not blank.
	 *
	 * @param url the article URL; checked with {@code AssertUtil.assertMethodRequire}
	 * @return the collected raw content; image entries carry the {@code data-src} value as found
	 *         in the page, no re-upload is performed here
	 * @throws HandleIOException when fetching the page fails with an {@link IOException}
	 */
	@Override
	public ArticleRawContent analysis(String url) {
		// Same argument check as urlResourseMatch, so both entry points reject a missing url alike.
		AssertUtil.assertMethodRequire(url, "url");

		Document document = fetchDocument(url);

		ArticleRawContent articleRawContent = new ArticleRawContent();
		// Metadata is set once, up front: it must not depend on the body containing any element.
		articleRawContent.setId(extractDocId(url));
		articleRawContent.setAccount(firstTextOrDefault(document, "#js_name", "处理公众号名称未采集到"));
		articleRawContent.setTitle(firstTextOrDefault(document, ".rich_media_title", "标题未采集到"));

		Elements blocks = document.select("#js_content").select(CONTENT_ELEMENT_QUERY);
		for (Element block : blocks) {
			appendImages(articleRawContent, block);
			String text = block.text();
			if (StringUtils.isNotBlank(text)) {
				articleRawContent.createContent(block.tagName(), text);
			}
		}

		return articleRawContent;
	}

	/** Fetches and parses the page, wrapping any I/O failure in a {@link HandleIOException}. */
	private static Document fetchDocument(String url) {
		Connection connection = Jsoup.connect(url).userAgent(HttpConnection.DEFAULT_UA);
		try {
			return connection.timeout(FETCH_TIMEOUT_MILLIS).get();
		} catch (IOException e) {
			throw new HandleIOException(e, "文章请求超时:[%s]", url);
		}
	}

	/**
	 * Derives the document id from the URL: the first piece, after splitting on {@code "/s/"},
	 * that does not start with {@code "http"}; an empty string when there is none.
	 * Any query string or fragment following the id is kept as part of it.
	 */
	private static String extractDocId(String url) {
		return Arrays.stream(url.split("/s/"))
				.filter(part -> !part.startsWith("http"))
				.findFirst()
				.orElse("");
	}

	/** Returns the text of the first element matching {@code cssQuery}, or {@code fallback} when nothing matches. */
	private static String firstTextOrDefault(Document document, String cssQuery, String fallback) {
		return Optional.ofNullable(document.select(cssQuery).first())
				.map(Element::text)
				.orElse(fallback);
	}

	/**
	 * Adds one {@code "img"} entry per image inside {@code block}.
	 * Each {@code <img>} is read individually so that an element holding several images yields
	 * all of them, and so that {@code data-src} and {@code data-type} always come from the same tag.
	 */
	private static void appendImages(ArticleRawContent articleRawContent, Element block) {
		for (Element img : block.select("img")) {
			String src = img.attr("data-src");
			String type = img.attr("data-type");
			if (StringUtils.isNotBlank(src) && StringUtils.isNotBlank(type)) {
				articleRawContent.createContent("img", src, type);
			}
		}
	}
}
