2011년 3월 28일 월요일

조선일보 Weekly Biz용 룰 파일을 만들어 봤습니다.

Patrick Kim님의 요청으로 간만에 만들어봤습니다.
나름대로 주석을 달아봤습니다.
이해가 안 되시면 다시 질문해 주세요.

<?xml version="1.0" encoding="UTF-8"?>
<PUMzRule Version="2.0" Type="News" Base="">
<!--
  Rule metadata: author, license URL, rule version and the ContentsLicense
  value (www.chosun.com).
-->
<Properties>
  <Author>Jinwoo Min</Author>
  <License>http://creativecommons.org/licenses/by-nc-sa/2.0/kr/</License>
  <Version>1.0.1.1</Version>
  <ContentsLicense>www.chosun.com</ContentsLicense>
</Properties>
<!--
  Target site description.
  NOTE(review): Encoding="euc-kr" is presumably the charset used to decode the
  fetched pages; confirm against the rule engine documentation.
-->
<Site Title="조선일보 (Weekly Biz)"
      Url="http://www.chosun.com/"
      Encoding="euc-kr"/>

<Task Id="Grab" RootContentHandlerId="Categorized">  
<!--
  Root handler of the Grab task (Type="ItemList"): declares the list of
  categories and hands each one to the child handler "Paged".
-->
<ContentHandler Id="Categorized" ChildContentHandlerId="Paged" Type="ItemList">
  <Properties>
    <!--
      Category list URL.
      Source URL: http://news.chosun.com/weeklybiz/svc/list_in/list.html?catid=1N&pn=2
      URL format: http://news.chosun.com/weeklybiz/svc/list_in/list.html?catid=${CategoryId}&pn=0
                  with CategoryId=1N
      The value is wrapped in a CDATA section so the raw "&" needs no escaping.
      The placeholder uses the ${Name} form, matching every other format
      string in this rule file.
    -->
    <UrlFormat><![CDATA[http://news.chosun.com/weeklybiz/svc/list_in/list.html?catid=${CategoryId}&pn=0]]></UrlFormat>
    <!-- Number of Item.N entries defined below. -->
    <ItemCount>1</ItemCount>
    <!-- Item.1 is the catid value taken from the source URL (catid=1N). -->
    <Item.1>1N</Item.1>
    <Item.1.Title>Weekly Biz</Item.1.Title>
    <!--
      Disabled sample entries showing how several categories would be listed.
      NOTE(review): ItemCount presumably has to match the number of enabled
      entries; confirm before enabling any of these.

      <Item.1>1</Item.1>
      <Item.1.Title>경제</Item.1.Title>
      <Item.2>2</Item.2>
      <Item.2.Title>정치</Item.2.Title>
      <Item.3>3</Item.3>
      <Item.3.Title>사회</Item.3.Title>
      <Item.4>4</Item.4>
      <Item.4.Title>국제</Item.4.Title>
      <Item.5>5</Item.5>
      <Item.5.Title>문화</Item.5.Title>
      <Item.6>6</Item.6>
      <Item.6.Title>오피니언</Item.6.Title>
    -->
    <!-- Name under which the item value is exposed; referenced as ${CategoryId}. -->
    <ItemTo>CategoryId</ItemTo>
  </Properties>
</ContentHandler>

<!--
  Second level of the Grab task (Type="NumberRange"): walks the list pages of
  a category and hands each page to the child handler "Listed".
-->
<ContentHandler Id="Paged" ChildContentHandlerId="Listed" Type="NumberRange">
  <Properties>
    <!--
      List page URL; catid selects the category and pn the page.
      Example source URL:
        http://news.chosun.com/weeklybiz/svc/list_in/list.html?catid=1N&pn=2
      The CDATA section lets the raw "&" appear without escaping.
    -->
    <UrlFormat><![CDATA[http://news.chosun.com/weeklybiz/svc/list_in/list.html?catid=${CategoryId}&pn=${PageNumber}]]></UrlFormat>
    <!--
      Page numbers for this site start at 1, not 0.
      NOTE(review): EndNumber is presumably inclusive (pages 1 and 2); confirm.
    -->
    <StartNumber>1</StartNumber>
    <EndNumber>2</EndNumber>
    <!-- Name under which the current number is exposed; referenced as ${PageNumber}. -->
    <NumberTo>PageNumber</NumberTo>
  </Properties>
</ContentHandler>

<!--
  Third level of the Grab task (Type="Regex"): extracts article links and
  titles from a list page and hands each one to the child handler "Article".
-->
<ContentHandler Id="Listed" ChildContentHandlerId="Article" Type="Regex">
  <Properties>
    <!--
      List page example:
        view-source:http://news.chosun.com/weeklybiz/svc/list_in/list.html?catid=1N&pn=2

      Excerpt of the page source that the pattern is written against:

        <div id="list">
        <div id="list_photo"><a href="/site/data/html_dir/2010/04/30/2010043001667.html"><img src="http://image.chosun.com/sitedata/thumbnail/201004/30/2010043001456_0_thumb.jpg"></a></div>
        <dl>
        <dt class="list_tit"><a href="/site/data/html_dir/2010/04/30/2010043001667.html">[Weekly BIZ] "한강 좋아지고 영어도 날로 잘 통해…서울, 쇼핑하며 걷기엔 아직은 불편"</a>
        <span class="author_txt">정철환 기자</span>
        <span class="author_icon"> ... </span>
        ...
        <dt class="list_tit"><a href="/site/data/html_dir/2010/04/30/2010043001669.html">[Weekly BIZ] [18人 글로벌 드림팀의 &#39;경영 상담소&#39;]  Q: 가정용 전자 제품을 해외에 내놓으려는데 소비자 뇌리에 박히는 디자인 개발 힘들어요</a>
        <span class="list_date">[10/05/01 03:03]</span> </dt>
        ...

      Regular expression (raw form):
        <dt class="[^"]*"><a href="([^"]*)">([^<]*)</a>
      It can be tried out at http://www.regexplanet.com/simple/index.html
      Inside the element it is wrapped in a CDATA section so that the quotes
      and "<" characters need no escaping.
    -->
    <UrlPattern><![CDATA[<dt class="[^"]*"><a href="([^"]*)">([^<]*)</a>]]></UrlPattern>
    <!--
      Capture groups of the pattern above:
        group 1 = href of the article link (a site-relative path)
        group 2 = link text, used as the article title
    -->
    <TitleIndex>2</TitleIndex>
    <UrlFactorIndex>1</UrlFactorIndex>
    <!-- Article URL = "http://news.chosun.com" followed by ${UrlFactor}. -->
    <UrlFormat>http://news.chosun.com${UrlFactor}</UrlFormat>
    <!-- Name under which the match index is exposed; referenced as ${Index}. -->
    <IndexTo>Index</IndexTo>
  </Properties>
</ContentHandler>

<!--
  Leaf handler of the Grab task (Type="Normal"): runs the content filters
  below, in document order, on a single article page.
-->
<ContentHandler Id="Article" Type="Normal">
  <ContentFilters>
    <!--
      Article page example:
        view-source:http://news.chosun.com/site/data/html_dir/2010/04/30/2010043001667.html

      Excerpt of the page source. An XML comment may not contain a double
      hyphen, so the HTML comment markers are written here with spaces between
      their characters; the Crop attributes below use the markers without
      those spaces.

        ....
        </div>
        < !- - article - ->
        <div id="article" class="article">
          <h3>세계의 도시 리뷰하는 英 '모노클'誌 편집장이 바라본  SEOUL<br></h3>
        ...
        </div>
        < !- - google_ad_section_end - ->
        < !- - article - ->
        < !- - art_wrap - ->
        <div class="art_wrap">
        ...
    -->

    <!--
      Crop content filter: keeps the part of the page between two markers.
        Start marker (raw): < !- - article - ->
        End marker (raw):   < !- - google_ad_section_end - ->
      In the attribute values "<" and ">" are written as &lt; and &gt;.
      IncludeTags="False" means the start and end markers themselves are not
      included in the result.
      XML string conversion helper: http://coderstoolbox.net/string/
    -->
    <Crop StartTag="&lt;!-- article --&gt;" EndTag="&lt;!-- google_ad_section_end --&gt;" IncludeTags="False" />

    <!--
      RemoveTags content filter.
      RemoveScripts="True": per the original author's note, removes all HTML
      tags and leaves the raw text.
      ExcludeTagsPattern: tags matching this pattern are not removed.
      The pattern must stay on a single line. An XML parser replaces a line
      break inside an attribute value with a space, which would change the
      "</dd" alternative into " </dd" so that it no longer matches.
    -->
    <RemoveTags RemoveScripts="True"
                ExcludeTagsPattern="&lt;img|&lt;br|&lt;/p|&lt;p|&lt;dl|&lt;dd|&lt;/dl|&lt;/dd|&lt;table|&lt;/table|&lt;tr|&lt;/tr|&lt;td|&lt;/td"/>

    <!--
      GrabImage content filter.
      UrlPattern matches an img tag and captures its src value in group 1.
      UrlFormat uses the captured value unchanged as the image URL.
      ReplaceFormat rewrites the matched text as <img pumz src="${FileName}".
      NOTE(review): ${FileName} is presumably the local name of the downloaded
      image; confirm against the rule engine documentation.
    -->
    <GrabImage UrlPattern="&lt;img[ a-z0-9&quot;\(\):=._]*src=&quot;([^&quot;]*)&quot;"
               UrlFormat="${UrlFactor}"
               ReplaceFormat="&lt;img pumz src=&quot;${FileName}&quot;"/>
    <Trim/>
  </ContentFilters>
</ContentHandler>
</Task> 

<!--
  Output task: one handler per level of the Grab task, using the same Ids.
  Each handler has a single ToFile filter that names the output file and the
  template (.st file) to use for it.
-->
<Task Id="Output" RootContentHandlerId="Categorized">

  <!-- Top-level index, written as index.html. -->
  <ContentHandler Id="Categorized" ChildContentHandlerId="Paged" Type="Normal">
    <ContentFilters>
      <ToFile FileNameFormat="index.html"
              Template="categorized-index.st"/>
    </ContentFilters>
  </ContentHandler>

  <!-- Per-category index; the file name carries the category id. -->
  <ContentHandler Id="Paged" ChildContentHandlerId="Listed" Type="Normal">
    <ContentFilters>
      <ToFile FileNameFormat="c${CategoryId}-index.html"
              Template="paged-index.st"/>
    </ContentFilters>
  </ContentHandler>

  <!-- Per-page article list; the file name carries category id and page number. -->
  <ContentHandler Id="Listed" ChildContentHandlerId="Article" Type="Normal">
    <ContentFilters>
      <ToFile FileNameFormat="c${CategoryId}-p${PageNumber}-index.html"
              Template="listed-index.st"/>
    </ContentFilters>
  </ContentHandler>

  <!-- Single article; the file name carries category id, page number and article index. -->
  <ContentHandler Id="Article" Type="Normal">
    <ContentFilters>
      <ToFile FileNameFormat="c${CategoryId}-p${PageNumber}-a${Index}.html"
              Template="article.st"/>
    </ContentFilters>
  </ContentHandler>

</Task>

</PUMzRule>