????jsoup????html
???????????? ???????[ 2016/12/29 10:31:19 ] ??????????????????? html
????1??jsoup???
????jsoup ????? Java??HTML???????????????????URL?????HTML????????????????????????API???????DOM??CSS?????????jQuery????????????????????????jsoup ????????????£?
??????1??????? URL?????????????н???HTML??
??????2?????DOM??CSS????????????????????
??????3???????HTML??????????????
????2??maven????
????<!-- jsoup -->
????<dependency>
????<groupId>org.jsoup</groupId>
????<artifactId>jsoup</artifactId>
????<version>1.7.3</version>
????</dependency>
????3?????????HTML???
????3.1 ???????????
????String html = "<html><head><title>First parse</title></head>" + "<body><p>Parsed HTML into a doc.</p></body></html>";
????<!-- ????????????Document????????ó?html??????????? -->
????Document doc = Jsoup.parse(html);
????3.2 ??URL???????
????Document doc = Jsoup.connect("http://example.com/").get();
????String title = doc.title();
??????1?????
????connect(String url)????????????μ? Connection?? ??get()??ú???????HTML???????????URL???HTML??????????????? IOException???????????Connection ???????????????????????????????????£?
????Document doc = Jsoup.connect("http://www.oschina.net/")
????.data("query", "Java") // request parameter
????.userAgent("I'm jsoup") // set the User-Agent
????.cookie("auth", "token") // set a cookie
????.timeout(3000) // set the connection timeout (milliseconds)
????.post(); // access the URL with the POST method
???????????????Web URLs (http??https Э??); ?????????????????????????????parse(File in, String charsetName) ???檔
????3.3 ?????????
????File input = new File("/tmp/input.html");
????Document doc = Jsoup.parse(input, "UTF-8", "http://www.oschina.net/");
??????1?????
????parse???????????????????????????????????????????????????HTML????л??к??????????????????????????????css ????????????????? baseURL ????????????? HTML ?????????·?????????????????jsoup ????????Щ URL ????????????????? baseURL??
???????? ??<a href=/project> ?????? </a> ??????? <a href=http://www.oschina.net/project> ?????? </a>??
????4?????????HTML???
????4.1 Document??????????
????4.1.1 ??????????
????getElementById(String id)
????getElementsByTag(String tag)
????getElementsByClass(String className)
????getElementsByAttribute(String key)
????siblingElements(), firstElementSibling(), lastElementSibling(), nextElementSibling(), previousElementSibling()
????parent(), children(), child(int index)
????4.1.2 ??????????
????attr(String key) – ???key????
????attributes() – ???????
????id(), className(), classNames()
????text() – ??????????
????html() – ?????????HTML????
????outerHtml() – ?????????????HTML????
????data() – ???<script>??<style>????е?????
????tag(), tagName()
????4.2 ???????????????jsoup??????????????????????????????jquery???????????????????????????????
??????
???·???
??????????????????
2023/3/23 14:23:39???д?ò??????????
2023/3/22 16:17:39????????????????????Щ??
2022/6/14 16:14:27??????????????????????????
2021/10/18 15:37:44???????????????
2021/9/17 15:19:29???·???????·
2021/9/14 15:42:25?????????????
2021/5/28 17:25:47??????APP??????????
2021/5/8 17:01:11