
This is my project structure (screenshot omitted). Why isn't log4j2 working?

build.gradle

apply plugin: 'java' 
apply plugin: 'eclipse' 
apply plugin: 'idea' 

jar { 
    baseName = 'dcv_crawler_engine' 
    version = '1.0.0-SNAPSHOT' 
} 


repositories { 
    jcenter() 
} 

dependencies {  
    compile 'edu.uci.ics:crawler4j:4.2' 
    compile 'org.apache.logging.log4j:log4j-api:2.5' 

    testCompile 'junit:junit:4.12' 
    testCompile 'edu.uci.ics:crawler4j:4.2' 
    testCompile 'org.apache.logging.log4j:log4j-api:2.5' 
} 

EntryPoint.java

package com.dcvsolution.crawler; 

import edu.uci.ics.crawler4j.crawler.CrawlConfig; 
import edu.uci.ics.crawler4j.crawler.CrawlController; 
import edu.uci.ics.crawler4j.fetcher.PageFetcher; 
import edu.uci.ics.crawler4j.robotstxt.RobotstxtConfig; 
import edu.uci.ics.crawler4j.robotstxt.RobotstxtServer; 

import org.apache.logging.log4j.LogManager; 
import org.apache.logging.log4j.Logger; 

public class EntryPoint { 

    /** 
    * For logging. 
    */ 
    private static final Logger logger = LogManager.getLogger(); 

    public static void main(String[] args) throws Exception { 

     logger.info("Bat dau crawling."); 

     String crawlStorageFolder = "/data/crawl/root"; 
     int numberOfCrawlers = 7; 

     CrawlConfig config = new CrawlConfig(); 
     config.setCrawlStorageFolder(crawlStorageFolder); 

     /* 
     * Instantiate the controller for this crawl. 
     */ 
     PageFetcher pageFetcher = new PageFetcher(config); 
     RobotstxtConfig robotstxtConfig = new RobotstxtConfig(); 
     RobotstxtServer robotstxtServer = new RobotstxtServer(robotstxtConfig, pageFetcher); 
     CrawlController controller = new CrawlController(config, pageFetcher, robotstxtServer); 

     /* 
     * For each crawl, you need to add some seed urls. These are the first 
     * URLs that are fetched and then the crawler starts following links 
     * which are found in these pages 
     */ 
     controller.addSeed("http://www.ics.uci.edu/~lopes/"); 
     controller.addSeed("http://www.ics.uci.edu/~welling/"); 
     controller.addSeed("http://www.ics.uci.edu/"); 

     /* 
     * Start the crawl. This is a blocking operation, meaning that your code 
     * will reach the line after this only when crawling is finished. 
     */ 
     logger.info("Bat dau crawling."); 
     controller.start(MyCrawler.class, numberOfCrawlers);   

    } 

} 

MyCrawler.java

package com.dcvsolution.crawler; 

import java.util.Set; 
import java.util.regex.Pattern; 
import edu.uci.ics.crawler4j.crawler.Page; 
import edu.uci.ics.crawler4j.crawler.WebCrawler; 
import edu.uci.ics.crawler4j.parser.HtmlParseData; 
import edu.uci.ics.crawler4j.url.WebURL; 

import org.apache.logging.log4j.LogManager; 
import org.apache.logging.log4j.Logger; 

public class MyCrawler extends WebCrawler { 

    private static final Logger logger = LogManager.getLogger(); 

    private final static Pattern FILTERS = Pattern.compile(".*(\\.(css|js|gif|jpg" + "|png|mp3|mp3|zip|gz))$"); 

    /** 
    * This method receives two parameters. The first parameter is the page in 
    * which we have discovered this new url and the second parameter is the new 
    * url. You should implement this function to specify whether the given url 
    * should be crawled or not (based on your crawling logic). In this example, 
    * we are instructing the crawler to ignore urls that have css, js, gif, ... 
    * extensions and to only accept urls that start with 
    * "http://www.ics.uci.edu/". In this case, we didn't need the referringPage 
    * parameter to make the decision. 
    */ 
    @Override 
    public boolean shouldVisit(Page referringPage, WebURL url) { 
     logger.info("Quet trang."); 
     String href = url.getURL().toLowerCase(); 
     return !FILTERS.matcher(href).matches() && href.startsWith("http://www.ics.uci.edu/"); 
    } 

    /** 
    * This function is called when a page is fetched and ready to be processed 
    * by your program. 
    */ 
    @Override 
    public void visit(Page page) { 
     String url = page.getWebURL().getURL(); 
     System.out.println("URL: " + url); 

     if (page.getParseData() instanceof HtmlParseData) { 
      HtmlParseData htmlParseData = (HtmlParseData) page.getParseData(); 
      String text = htmlParseData.getText(); 
      String html = htmlParseData.getHtml(); 
      Set<WebURL> links = htmlParseData.getOutgoingUrls(); 

      logger.info("Quet trang."); 

      System.out.println("Text length: " + text.length()); 
      System.out.println("Html length: " + html.length()); 
      System.out.println("Number of outgoing links: " + links.size()); 
     } 
    } 
} 

log4j2.properties

log4j.rootLogger=DEBUG, stdout 

Can anyone help me figure out why log4j isn't working?
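One thing worth noting for comparison: log4j 2 does not read log4j 1.x-style keys such as log4j.rootLogger. A log4j2.properties file uses log4j 2's own properties syntax (supported since 2.4). A minimal console configuration might look like the sketch below; the appender name STDOUT and the pattern are placeholders, not taken from the project:

# log4j 2 properties syntax (2.4/2.5 era); illustrative sketch only
status = warn
appenders = console

# Console appender writing to standard output
appender.console.type = Console
appender.console.name = STDOUT
appender.console.layout.type = PatternLayout
appender.console.layout.pattern = %d{HH:mm:ss.SSS} [%t] %-5level %logger{36} - %msg%n

# Root logger at DEBUG, attached to the console appender
rootLogger.level = debug
rootLogger.appenderRefs = stdout
rootLogger.appenderRef.stdout.ref = STDOUT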



This is my full project: https://gitlab.com/Donhu/DCV_crawler_engine/tree/master –

Answer


You need both the log4j-api and the log4j-core dependencies.

At the moment you only have the api dependency. Add this to your compile dependencies:

'org.apache.logging.log4j:log4j-core:2.5' 
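Applied to the build.gradle above, the dependencies block would then look roughly like this (a sketch using the versions from the question, not a verified build; the test dependencies are left as they were):

dependencies {
    compile 'edu.uci.ics:crawler4j:4.2'
    compile 'org.apache.logging.log4j:log4j-api:2.5'
    // log4j-core supplies the actual logging implementation; with only the api artifact the calls go nowhere
    compile 'org.apache.logging.log4j:log4j-core:2.5'

    testCompile 'junit:junit:4.12'
    testCompile 'edu.uci.ics:crawler4j:4.2'
    testCompile 'org.apache.logging.log4j:log4j-api:2.5'
}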

Thank you, your answer was helpful and I have since added the extra dependency. But my problem looks more complicated than that, because the crawler library uses 'slf4j'. My full source code: https://gitlab.com/Donhu/DCV_crawler_engine/tree/master –
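Since crawler4j logs through slf4j, getting its output into the log4j 2 configuration also needs the slf4j binding artifact. The dependency below is a suggestion (version 2.5 assumed to match the other log4j artifacts), not something confirmed in the answer:

// Routes slf4j calls (used by crawler4j) to the log4j 2 implementation
compile 'org.apache.logging.log4j:log4j-slf4j-impl:2.5'

With that binding on the classpath, the application's own log4j calls and crawler4j's slf4j calls both end up in the same log4j 2 configuration.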