你的浏览器不支持canvas

做你害怕做的事情,然后你会发现,不过如此。

Java使用epublib按章节拆分epub电子书

时间: 作者: 黄运鑫

本文章属原创文章,未经作者许可,禁止转载,复制,下载,以及用作商业用途。原作者保留所有解释权。


  • maven引入使用的jar包如下:
<!-- html处理jar包 -->
<dependency>
    <groupId>org.jsoup</groupId>
    <artifactId>jsoup</artifactId>
    <version>1.11.3</version>
</dependency>
<!--epub-->
<dependency>
    <groupId>nl.siegmann.epublib</groupId>
    <artifactId>epublib-core</artifactId>
    <version>3.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/net.sf.kxml/kxml2 -->
<dependency>
    <groupId>net.sf.kxml</groupId>
    <artifactId>kxml2</artifactId>
    <version>2.3.0</version>
</dependency>
  • 创建MyBook类,继承nl.siegmann.epublib.domain.Book用来接收拆分后的子章节:
import nl.siegmann.epublib.domain.Book;
import java.util.List;

/**
 * Extension of the epublib {@link Book} that can additionally hold the books
 * generated from its child chapters.
 *
 * @author hyx
 */
public class MyBook extends Book {

    /**
     * Books generated from the child chapters of this chapter.
     * Stays {@code null} until {@link #setChildBookList(List)} is called.
     */
    private List<MyBook> childBookList;

    /**
     * @return the books generated from the child chapters, or {@code null} if none were set
     */
    public List<MyBook> getChildBookList() {
        return childBookList;
    }

    /**
     * @param childBookList the books generated from the child chapters
     */
    public void setChildBookList(List<MyBook> childBookList) {
        this.childBookList = childBookList;
    }
}
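  • 下面是测试方法和具体方法(注意:代码中用到的StringUtils(提供isBlank/isNotBlank方法,例如Apache Commons Lang中的同名工具类)和Fragment(一个包含resourceId、fragmentId两个属性、对应构造方法和getter的简单类)未在文中给出,需要自行引入或创建):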
import nl.siegmann.epublib.domain.Book;
import nl.siegmann.epublib.domain.Resource;
import nl.siegmann.epublib.domain.TOCReference;
import nl.siegmann.epublib.domain.TableOfContents;
import nl.siegmann.epublib.epub.EpubReader;
import nl.siegmann.epublib.epub.EpubWriter;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Entities;
import org.jsoup.select.Elements;

import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.util.*;

/**
 * 电子书章节表
 *
 * @author hyx
 * @version 1.0
 */
public class Test {

    //需要拆分的电子书地址
    private static String bookPath = "C:\\Users\\hyx\\Desktop\\ebook_b38f0e50140b4deba06ba464d17176a5.epub";
    //电子书拆分后的输出文件夹
    private static String outPath = "C:\\Users\\hyx\\Desktop\\aaa\\";

    /**
     * Loads the e-book at {@code bookPath}, splits it by chapter and writes every
     * resulting book (including nested child chapters) below {@code outPath}.
     *
     * @param args unused
     * @throws Exception if the book cannot be read, split or written
     */
    public static void main(String[] args) throws Exception {
        // Load the e-book; the input stream is closed as soon as reading is finished
        EpubReader epubReader = new EpubReader();
        Book book;
        try (FileInputStream bookStream = new FileInputStream(bookPath)) {
            book = epubReader.readEpub(bookStream);
        }
        // Split by chapter and save each resulting book
        List<MyBook> myBookList = splitEbook(book);
        for (MyBook myBook : myBookList) {
            saveEbookFile(myBook);
        }
    }

    /**
     * Writes a split e-book to the output directory as {@code <title>.epub}, then
     * recursively writes the books of its child chapters.
     *
     * @param myBook the split e-book to save; {@code null} is ignored
     * @throws Exception if the file cannot be created or written
     */
    public static void saveEbookFile(MyBook myBook) throws Exception {
        if (myBook == null) {
            return;
        }
        File file = new File(outPath + myBook.getTitle() + ".epub");
        EpubWriter epubWriter = new EpubWriter();
        // try-with-resources guarantees the file handle is released even when writing fails
        try (FileOutputStream fileOutputStream = new FileOutputStream(file)) {
            epubWriter.write(myBook, fileOutputStream);
        }
        // Recursively save the child chapters
        List<MyBook> myBookList = myBook.getChildBookList();
        if (myBookList != null) {
            for (MyBook book : myBookList) {
                saveEbookFile(book);
            }
        }
    }

    /**
     * Splits an e-book into one book per top-level table-of-contents entry.
     * Child chapters are split recursively and attached to the book of their parent.
     *
     * @param book the e-book to split
     * @return one generated book per top-level chapter; empty if the book has no chapters
     */
    public static List<MyBook> splitEbook(Book book) throws Exception {
        List<TOCReference> topLevelChapters = book.getTableOfContents().getTocReferences();
        List<MyBook> result = new ArrayList<>();
        if (topLevelChapters == null || topLevelChapters.isEmpty()) {
            return result;
        }

        for (int index = 0; index < topLevelChapters.size(); index++) {
            // Look ahead to the following top-level chapter, if there is one
            int followingIndex = index + 1;
            TOCReference following = followingIndex < topLevelChapters.size()
                    ? topLevelChapters.get(followingIndex)
                    : null;

            // Remember file + anchor of the following chapter so nested chapters can cut their content there
            List<Fragment> followingAnchors = new ArrayList<>();
            if (following != null) {
                String followingResourceId = following.getResourceId();
                String followingFragmentId = following.getFragmentId();
                boolean hasAnchor = StringUtils.isNotBlank(followingFragmentId)
                        && StringUtils.isNotBlank(followingResourceId);
                if (hasAnchor) {
                    followingAnchors.add(new Fragment(followingResourceId, followingFragmentId));
                }
            }

            // Build the book for this top-level chapter and recursively split its children
            MyBook chapterBook = new MyBook();
            setTOCReference(book, chapterBook, topLevelChapters.get(index), following, followingAnchors);
            result.add(chapterBook);
        }
        return result;
    }

    /**
     * Builds the book for one chapter and recursively builds the books of its child chapters.
     * <p>
     * The chapter's HTML is copied from the source resource and trimmed so that it only
     * contains this chapter: content before the chapter's own anchor is removed, and content
     * starting at the anchor of the next chapter, of the first child chapter, or of a recorded
     * following chapter is removed when that anchor lives in the same resource. Only direct
     * children of {@code body} are examined when trimming the leading content, and only
     * siblings of an anchor element are removed when trimming trailing content.
     * <p>
     * The chapter content is decoded and encoded as UTF-8 instead of the platform default
     * charset, so non-ASCII text is not corrupted on platforms whose default is not UTF-8.
     *
     * @param resourceBook           the source e-book
     * @param newBook                the book being generated for this chapter
     * @param tocReference           the source chapter
     * @param nextTocReference       the chapter following the source chapter on the same level; may be {@code null}
     * @param parentNextFragmentList anchors of chapters that follow this chapter's ancestors; may be
     *                               {@code null}. When non-null, anchors of this chapter's children's
     *                               following siblings are appended to this same list.
     * @return the section that was added to {@code newBook}
     * @throws Exception if the chapter content cannot be read or a child chapter fails to split
     */
    public static TOCReference setTOCReference(Book resourceBook, MyBook newBook, TOCReference tocReference,
                                               TOCReference nextTocReference, List<Fragment> parentNextFragmentList) throws Exception {
        if (parentNextFragmentList == null) {
            parentNextFragmentList = new ArrayList<>();
        }
        // Work on a new Resource object so the trimmed content is set on the copy, not on the source resource
        Resource res = tocReference.getResource();
        Resource r = new Resource(res.getId(), res.getData(), res.getHref(), res.getMediaType());

        // Title
        newBook.getMetadata().addTitle(tocReference.getTitle());
        // Cover image
        newBook.setCoverImage(resourceBook.getCoverImage());

        // Parse the chapter content; decode explicitly as UTF-8 rather than with the platform default charset
        String data = new String(r.getData(), java.nio.charset.StandardCharsets.UTF_8);
        org.jsoup.nodes.Document doc = Jsoup.parse(data);
        // Emit XHTML
        doc.outputSettings().syntax(org.jsoup.nodes.Document.OutputSettings.Syntax.xml).escapeMode(Entities.EscapeMode.xhtml);
        // Anchor of the current chapter
        String fragmentId = tocReference.getFragmentId();
        // Resource (file) of the current chapter
        String resourceId = tocReference.getResourceId();

        // If the current chapter has an anchor, remove the body children that precede it
        if (StringUtils.isNotBlank(fragmentId)) {
            Element body = doc.body();
            if (body != null) {
                for (Element child : body.children()) {
                    if (fragmentId.equals(child.id())) {
                        break;
                    }
                    child.remove();
                }
            }
        }

        // If the next chapter has an anchor in the same resource, remove everything from that anchor on
        if (nextTocReference != null) {
            String nextFragmentId = nextTocReference.getFragmentId();
            String nextResourceId = nextTocReference.getResourceId();
            if (StringUtils.isNotBlank(nextFragmentId) && resourceId.equals(nextResourceId)) {
                Element elementById = doc.getElementById(nextFragmentId);
                if (elementById != null) {
                    removeAnchorAndFollowingSiblings(elementById);
                }
            }
        }

        // If the first child chapter has an anchor in the same resource, remove everything from that anchor on
        List<TOCReference> childrenList = tocReference.getChildren();
        if (childrenList != null && childrenList.size() > 0) {
            TOCReference childReference = childrenList.get(0);
            if (childReference != null) {
                String childFragmentId = childReference.getFragmentId();
                String childResourceId = childReference.getResourceId();
                if (StringUtils.isNotBlank(childFragmentId) && resourceId.equals(childResourceId)) {
                    Element elementById = doc.getElementById(childFragmentId);
                    if (elementById != null) {
                        removeAnchorAndFollowingSiblings(elementById);
                    }
                }
            }
        }

        // If a recorded following chapter lives in the same resource, remove it as well
        for (Fragment fragment : parentNextFragmentList) {
            if (StringUtils.isBlank(fragment.getResourceId()) || StringUtils.isBlank(fragment.getFragmentId())) {
                continue;
            }
            String parentNextResourceId = fragment.getResourceId();
            String parentNextFragmentId = fragment.getFragmentId();
            if (!resourceId.equals(parentNextResourceId)) {
                continue;
            }
            Element elementById = doc.getElementById(parentNextFragmentId);
            if (elementById == null) {
                continue;
            }
            // Position of the current chapter's anchor among its sibling elements
            Integer siblingIndex = null;
            if (StringUtils.isNotBlank(fragmentId)) {
                Element element = doc.getElementById(fragmentId);
                if (element != null) {
                    siblingIndex = element.elementSiblingIndex();
                }
            }
            // Position of the anchor to remove among its sibling elements
            int parentSiblingIndex = elementById.elementSiblingIndex();
            // Only cut when the recorded anchor comes after the current chapter's anchor
            if (siblingIndex != null && parentSiblingIndex > siblingIndex) {
                removeAnchorAndFollowingSiblings(elementById);
            }
        }

        // Store the trimmed content, encoded as UTF-8
        String html = doc.outerHtml();
        r.setData(html.getBytes(java.nio.charset.StandardCharsets.UTF_8));

        // Add the chapter to the new book
        TOCReference addSection = newBook.addSection(tocReference.getTitle(), r);
        // Keep the chapter's anchor
        addSection.setFragmentId(tocReference.getFragmentId());
        // Copy referenced css and images
        setCssAndImg(resourceBook, newBook);

        // Recursively split the child chapters
        List<TOCReference> children = tocReference.getChildren();
        List<MyBook> myBookList = new ArrayList<>();
        if (children != null && children.size() > 0) {
            for (int i = 0; i < children.size(); i++) {
                TOCReference next = null;
                if ((i + 1) < children.size()) {
                    next = children.get(i + 1);
                    if (next != null) {
                        // Resource (file) of the following child chapter
                        String parentNextResourceId = next.getResourceId();
                        // Anchor of the following child chapter
                        String parentNextFragmentId = next.getFragmentId();
                        if (StringUtils.isNotBlank(parentNextResourceId) && StringUtils.isNotBlank(parentNextFragmentId)) {
                            parentNextFragmentList.add(new Fragment(parentNextResourceId, parentNextFragmentId));
                        }
                    }
                }
                MyBook newBook2 = new MyBook();
                setTOCReference(resourceBook, newBook2, children.get(i), next, parentNextFragmentList);
                myBookList.add(newBook2);
            }
        }
        newBook.setChildBookList(myBookList);
        return addSection;
    }

    /**
     * Removes the given anchor element together with every sibling element that follows it.
     * Siblings are removed one at a time; per the original author's note, a selector-based
     * removal was used before and stalled when the content was very large.
     *
     * @param anchor the element marking the start of the content to drop; must not be {@code null}
     */
    private static void removeAnchorAndFollowingSiblings(Element anchor) {
        Element following = anchor.nextElementSibling();
        while (following != null) {
            following.remove();
            following = anchor.nextElementSibling();
        }
        anchor.remove();
    }


    /**
     * Copies into the new book the resources referenced by its first section:
     * stylesheets ({@code link href}), images ({@code img src}) and link targets ({@code a href}).
     * <p>
     * A leading {@code "../"} is stripped from each reference before it is looked up in the
     * source book; references without that prefix are looked up unchanged and blank references
     * are skipped. References that do not resolve to a resource of the source book are ignored.
     * This is best-effort: any failure is printed and otherwise ignored.
     *
     * @param sourceBook the source e-book; nothing happens if {@code null}
     * @param newBook    the generated e-book; nothing happens if {@code null} or if it has no section yet
     */
    public static void setCssAndImg(Book sourceBook, Book newBook) {
        if (sourceBook == null || newBook == null) {
            return;
        }
        try {
            List<TOCReference> sections = newBook.getTableOfContents().getTocReferences();
            if (sections == null || sections.isEmpty()) {
                return;
            }
            Resource resource = sections.get(0).getResource();
            // Decode explicitly as UTF-8 rather than with the platform default charset
            String html = new String(resource.getData(), java.nio.charset.StandardCharsets.UTF_8);
            org.jsoup.nodes.Document doc = Jsoup.parse(html);
            // css, images and hyperlinks, in that order
            copyReferencedResources(sourceBook, newBook, doc.getElementsByTag("link"), "href");
            copyReferencedResources(sourceBook, newBook, doc.getElementsByTag("img"), "src");
            copyReferencedResources(sourceBook, newBook, doc.getElementsByTag("a"), "href");
        } catch (Exception e) {
            // Best-effort: a failure here must not abort the split
            e.printStackTrace();
        }
    }

    /**
     * Looks up, in the source book, every resource referenced by the given attribute of the
     * given elements and adds the ones that exist to the new book.
     *
     * @param sourceBook    book to look the resources up in
     * @param newBook       book to add the found resources to
     * @param elements      elements carrying the references
     * @param attributeName name of the attribute holding the reference
     */
    private static void copyReferencedResources(Book sourceBook, Book newBook, Elements elements, String attributeName) {
        Set<String> hrefs = new HashSet<>();
        for (Element element : elements) {
            String href = element.attr(attributeName);
            // A missing attribute or a short value such as "#" must not abort the whole copy
            if (href == null || href.trim().isEmpty()) {
                continue;
            }
            if (href.startsWith("../")) {
                href = href.substring(3);
            }
            hrefs.add(href);
        }
        for (String href : hrefs) {
            Resource res = sourceBook.getResources().getByHref(href);
            if (res != null) {
                newBook.addResource(res);
            }
        }
    }
}

对于本文内容有问题或建议的小伙伴,欢迎在文章底部留言交流讨论。