""" robotparser.py

    Copyright (C) 2000  Bastian Kleineidam

    You can choose between two licenses when using this package:
    1) GNU GPLv2
    2) PSF license for Python 2.2

    The robots.txt Exclusion Protocol is implemented as specified in
    http://www.robotstxt.org/norobots-rfc.txt
"""

import collections
import re
import urllib.error
import urllib.parse
import urllib.request

__all__ = ["RobotFileParser"]

RequestRate = collections.namedtuple("RequestRate", "requests seconds")


def normalize(path):
    # Round-trip through unquote/quote so that equivalent
    # percent-encodings of the same path compare equal.
    unquoted = urllib.parse.unquote(path, errors='surrogateescape')
    return urllib.parse.quote(unquoted, errors='surrogateescape')


def normalize_path(path):
    # Normalize the path component and, if present, each token of the
    # query string (the '=' and '&' separators are left untouched).
    path, sep, query = path.partition('?')
    path = normalize(path)
    if sep:
        query = re.sub(r'[^=&]+', lambda m: normalize(m[0]), query)
        path += '?' + query
    return path
class RobotFileParser:
    """ This class provides a set of methods to read, parse and answer
    questions about a single robots.txt file.

    """

    def __init__(self, url=''):
        self.entries = []
        self.sitemaps = []
        self.default_entry = None
        self.disallow_all = False
        self.allow_all = False
        self.set_url(url)
        self.last_checked = 0

    def mtime(self):
        """Returns the time the robots.txt file was last fetched.

        This is useful for long-running web spiders that need to
        check for new robots.txt files periodically.

        """
        return self.last_checked

    def modified(self):
        """Sets the time the robots.txt file was last fetched to the
        current time.

        """
        import time
        self.last_checked = time.time()

    def set_url(self, url):
        """Sets the URL referring to a robots.txt file."""
        self.url = url
        self.host, self.path = urllib.parse.urlsplit(url)[1:3]

    def read(self):
        """Reads the robots.txt URL and feeds it to the parser."""
        try:
            f = urllib.request.urlopen(self.url)
        except urllib.error.HTTPError as err:
            if err.code in (401, 403):
                self.disallow_all = True
            elif err.code >= 400 and err.code < 500:
                self.allow_all = True
            err.close()
        else:
            raw = f.read()
            self.parse(raw.decode("utf-8").splitlines())

    def _add_entry(self, entry):
        if "*" in entry.useragents:
            # the default entry is considered last
            if self.default_entry is None:
                # the first default entry wins
                self.default_entry = entry
        else:
            self.entries.append(entry)

    def parse(self, lines):
        """Parse the input lines from a robots.txt file.

        We allow that a user-agent: line is not preceded by
        one or more blank lines.
        """
        # states:
        #   0: start state
        #   1: saw user-agent line
        #   2: saw an allow or disallow line
        state = 0
        entry = Entry()

        self.modified()
        for line in lines:
            if not line:
                if state == 1:
                    entry = Entry()
                    state = 0
                elif state == 2:
                    self._add_entry(entry)
                    entry = Entry()
                    state = 0
            # remove optional comment and strip line
            i = line.find('#')
            if i >= 0:
                line = line[:i]
            line = line.strip()
            if not line:
                continue
            line = line.split(':', 1)
            if len(line) == 2:
                line[0] = line[0].strip().lower()
                line[1] = line[1].strip()
                if line[0] == "user-agent":
                    if state == 2:
                        self._add_entry(entry)
                        entry = Entry()
                    entry.useragents.append(line[1])
                    state = 1
                elif line[0] == "disallow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], False))
                        state = 2
                elif line[0] == "allow":
                    if state != 0:
                        entry.rulelines.append(RuleLine(line[1], True))
                        state = 2
                elif line[0] == "crawl-delay":
                    if state != 0:
                        # before trying to convert to int we need to make
                        # sure that robots.txt has valid syntax otherwise
                        # it will crash
                        if line[1].strip().isdigit():
                            entry.delay = int(line[1])
                        state = 2
                elif line[0] == "request-rate":
                    if state != 0:
                        numbers = line[1].split('/')
                        # check if all values are sane
                        if (len(numbers) == 2 and numbers[0].strip().isdigit()
                                and numbers[1].strip().isdigit()):
                            entry.req_rate = RequestRate(int(numbers[0]),
                                                         int(numbers[1]))
                        state = 2
                elif line[0] == "sitemap":
                    # According to http://www.sitemaps.org/protocol.html
                    # "This directive is independent of the user-agent line,
                    #  so it doesn't matter where you place it in your file."
                    # Therefore we do not change the state of the parser.
                    self.sitemaps.append(line[1])
        if state == 2:
            self._add_entry(entry)

    def can_fetch(self, useragent, url):
        """using the parsed robots.txt decide if useragent can fetch url"""
        if self.disallow_all:
            return False
        if self.allow_all:
            return True
        # Until the robots.txt file has been read or found not
        # to exist, we must assume that no url is allowable.
        # This prevents false positives when a user erroneously
        # calls can_fetch() before calling read().
        if not self.last_checked:
            return False
        # search for given user agent matches
        # the first match counts
        parsed_url = urllib.parse.urlsplit(url)
        url = urllib.parse.urlunsplit(('', '') + parsed_url[2:])
        url = normalize_path(url)
        if not url:
            url = "/"
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.allowance(url)
        # try the default entry last
        if self.default_entry:
            return self.default_entry.allowance(url)
        # agent not found ==> access granted
        return True

    def crawl_delay(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.delay
        if self.default_entry:
            return self.default_entry.delay
        return None

    def request_rate(self, useragent):
        if not self.mtime():
            return None
        for entry in self.entries:
            if entry.applies_to(useragent):
                return entry.req_rate
        if self.default_entry:
            return self.default_entry.req_rate
        return None

    def site_maps(self):
        if not self.sitemaps:
            return None
        return self.sitemaps

    def __str__(self):
        entries = self.entries
        if self.default_entry is not None:
            entries = entries + [self.default_entry]
        return '\n\n'.join(map(str, entries))
class RuleLine:
    """A rule line is a single "Allow:" (allowance==True) or "Disallow:"
    (allowance==False) followed by a path."""

    def __init__(self, path, allowance):
        if path == '' and not allowance:
            # an empty value means allow all
            allowance = True
        self.path = normalize_path(path)
        self.allowance = allowance

    def applies_to(self, filename):
        return self.path == "*" or filename.startswith(self.path)

    def __str__(self):
        return ("Allow" if self.allowance else "Disallow") + ": " + self.path
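# Matching is plain string-prefix comparison on the normalized path; an
# illustrative check (not part of the original module):
#
#     >>> RuleLine("/private/", False).applies_to("/private/data.html")
#     True
#     >>> RuleLine("/private/", False).applies_to("/public/index.html")
#     False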
class Entry:
    """An entry has one or more user-agents and zero or more rulelines"""

    def __init__(self):
        self.useragents = []
        self.rulelines = []
        self.delay = None
        self.req_rate = None

    def __str__(self):
        ret = []
        for agent in self.useragents:
            ret.append(f"User-agent: {agent}")
        if self.delay is not None:
            ret.append(f"Crawl-delay: {self.delay}")
        if self.req_rate is not None:
            rate = self.req_rate
            ret.append(f"Request-rate: {rate.requests}/{rate.seconds}")
        ret.extend(map(str, self.rulelines))
        return '\n'.join(ret)

    def applies_to(self, useragent):
        """check if this entry applies to the specified agent"""
        # split the name token and make it lower case
        useragent = useragent.split("/")[0].lower()
        for agent in self.useragents:
            if agent == '*':
                # we have the catch-all agent
                return True
            agent = agent.lower()
            if agent in useragent:
                return True
        return False

    def allowance(self, filename):
        """Preconditions:
        - our agent applies to this entry
        - filename is URL encoded"""
        for line in self.rulelines:
            if line.applies_to(filename):
                return line.allowance
        return True