/*
 * Crawl SGU
 *
 * Paste metadata (carried over from the code-sharing page):
 *   mail@pastecode.io avatar
 *   unknown
 *   java
 *   3 years ago
 *   1.4 kB
 *   1
 *   Indexable
 *   Never
 */
import java.io.*;
import java.net.URL;
import java.net.URLConnection;

/**
 * Downloads a fixed range of pages from {@code thongtindaotao.sgu.edu.vn}
 * (one page per numeric id) and appends selected lines of each page to the
 * local file {@code crawl.html}.
 *
 * <p>For every id the output receives the text {@code "MSV: <id>"} followed by
 * the lines selected by {@link #copyRelevantLines(BufferedReader, Writer)}.
 */
public class Main {

    /** File the extracted lines are written to (created or truncated on start). */
    private static final String OUTPUT_FILE = "crawl.html";

    /**
     * Charset used for both the output file and for decoding the downloaded pages.
     * NOTE(review): decoding the response as UTF-8 assumes the site serves UTF-8;
     * the previous code decoded with the platform default charset, which made the
     * result machine-dependent. Confirm against the server's Content-Type.
     */
    private static final String CHARSET = "utf-8";

    /** Page address; the numeric id is appended directly to this prefix. */
    private static final String PAGE_URL_PREFIX =
            "http://thongtindaotao.sgu.edu.vn/Default.aspx?page=xemhocphi&id=";

    /** First id that is requested (inclusive). */
    private static final long FIRST_ID = 3119410001L;

    /** Id at which the crawl stops (exclusive). */
    private static final long END_ID = 3119410509L;

    /** A line containing this text switches copying on (presumably the id of the fee table element). */
    private static final String TABLE_START_MARKER = "ctl00_ContentPlaceHolder1_ctl00_gvHocPhi";

    /** While copying is on, a line containing this text is the last one copied. */
    private static final String TABLE_END_MARKER = "</table>";

    /** A line containing this text is always copied (presumably the student-name label). */
    private static final String NAME_LABEL_MARKER =
            "ctl00_ContentPlaceHolder1_ctl00_ucThongTinSV_lblTenSinhVien";

    /**
     * Runs the crawl over every id in {@code [FIRST_ID, END_ID)}.
     *
     * <p>The output writer is opened with try-with-resources so that everything
     * written so far is flushed to disk and the file is closed, even when a
     * request fails part-way through. As before, the first I/O failure stops the
     * crawl and its stack trace is printed.
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        try (FileOutputStream fileOut = new FileOutputStream(OUTPUT_FILE);
             Writer out = new OutputStreamWriter(fileOut, CHARSET)) {
            for (long id = FIRST_ID; id < END_ID; id++) {
                crawlPage(id, out);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Fetches the page for one id and writes its header plus selected lines to {@code out}.
     *
     * @param id  numeric id appended to {@link #PAGE_URL_PREFIX}
     * @param out destination for the extracted text; not closed by this method
     * @throws IOException if the page cannot be opened or read, or the output cannot be written
     */
    private static void crawlPage(long id, Writer out) throws IOException {
        URL url = new URL(PAGE_URL_PREFIX + id);
        URLConnection conn = url.openConnection();
        // Closing the reader also closes the connection's input stream, so no
        // stream is left open per requested id.
        try (BufferedReader in =
                     new BufferedReader(new InputStreamReader(conn.getInputStream(), CHARSET))) {
            // No line break after the id: the first copied line follows it directly.
            out.write("MSV: " + id);
            copyRelevantLines(in, out);
        }
    }

    /**
     * Copies the relevant lines of one page from {@code in} to {@code out}, each followed by
     * {@code "\n"}.
     *
     * <p>A line is copied when it contains {@link #NAME_LABEL_MARKER}, or when it lies in the
     * region that starts at a line containing {@link #TABLE_START_MARKER} and ends at the next
     * line containing {@link #TABLE_END_MARKER} (both inclusive).
     *
     * @param in  reader positioned at the start of the page
     * @param out destination for the copied lines
     * @throws IOException if reading or writing fails
     */
    private static void copyRelevantLines(BufferedReader in, Writer out) throws IOException {
        boolean insideTable = false;
        String line;
        while ((line = in.readLine()) != null) {
            if (line.contains(TABLE_START_MARKER)) {
                insideTable = true;
            }
            if (insideTable && line.contains(TABLE_END_MARKER)) {
                // Closing line of the table: emit it and stop copying.
                out.write(line + "\n");
                insideTable = false;
            }
            // Not an else-branch on purpose: a closing line that also carries the
            // name label is written a second time, exactly as the original did.
            if (insideTable || line.contains(NAME_LABEL_MARKER)) {
                out.write(line + "\n");
            }
        }
    }

}