}N"hOddlmZddlZddlZddlZddlZddlmZddlm Z m Z ddl Z ddl Z ddl mZddlmZmZmZddlmZddlmZdd lmZdd lmZmZmZddlZejejeZ d8dZ!d9dZ"d:d;dZ#ddZ&d?d@dZ'ej(e%d d!"Z)ej(e'd#d$"Z*dAd&Z+ed'krej,ej-d()e .d*ej/d+s e0d,nX e+Z1e%d-Z2e0d.e0e2e0d/n #e3$rZ4e0d0e4YdZ4[4ndZ4[4wwxYwdZ1 e0d1d2Z5e'e5Z6e67d3se0d4e6dd5d6dSe0d7e6dS#e3$rZ8e0d0e8YdZ8[8dSdZ8[8wwxYwdS)B) annotationsN)Path)OptionalList) FunctionAgent) TextBlock ImageBlock ChatMessage) FunctionTool) GoogleGenAI)tqdm)YouTubeTranscriptApiTranscriptsDisabledNoTranscriptFound env_contentstr output_filereturnNonec d|vrtd|dddd}|dd}t |d5}||ddddS#1swxYwYdS#t $r$}td t|d}~wwxYw) z8Convert environment variable content back to cookie filez="zInvalid env content format"z\n wN!Error converting to cookie file: ) ValueErrorsplitstripreplaceopenwrite Exceptionr)rrcontentcookie_contentfes B/Users/aedelon/Workspace/GAIA_Agent/agents/video_analyzer_agent.pyenv_to_cookiesr(s&G { " "9:: :##D!,,Q/55c::!55+s # # $q GGN # # # $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ $ GGGESVVEEFFFGs<A(B*B  B BBBB C$CCc tjdd}|stdtd|d|dS#t$r$}tdt |d}~wwxYw)z:Convert environment variable from .env file to cookie file YT_COOKIEz YT_COOKIE not found in .env filez YT_COOKIE="rrN)osgetenvrr(r"r)rrr&s r'env_to_cookies_from_envr./sGi R00  A?@@ @3[333[AAAAA GGGESVVEEFFFGs:> A,A''A,$../prompts/video_analyzer_prompt.txtfilenamecttj}||z } |dd5}|}t d||cdddS#1swxYwYn[#t$rt d|Yn4t$r(}t d||d Yd}~nd}~wwxYw d S) zLoad the system prompt for video analysis from *filename*. Falls back to a minimal prompt if the file cannot be read. rzutf-8)encodingz)Successfully loaded system prompt from %sNz0Prompt file %s not found. Using fallback prompt.z Error loading prompt file %s: %sTexc_infozYou are a video analyzer. Provide a factual, chronological description of the video, identify key events, and summarise insights.) r__file__parentresolver readloggerinfoFileNotFoundErrorerrorr")r0 script_dir prompt_pathfppromptexcs r'load_prompt_from_filerC?sn h&J(1133K    cG  4 4 WWYYF KKC[ Q Q Q                      >           . S4           QsAB 1B: BB  BB B%C+: C+C&&C+c~tj|dtj|}|st d|gdfS|tj}t|tj }||z }t||z }|dkrd}g}d} t|d5} |r| \} } | sn| |zdkrV| |z } tj |d | d d }tj|| ||| f| dz } | d|dddn #1swxYwY|||fS) zi Extract frames from video at specified FPS Returns a list of (frame_path, timestamp) tuples Texist_okzError: Could not open video NrrzExtracting frames)totaldescframe_06dz.jpg)r,makedirscv2 VideoCaptureisOpenedprintget CAP_PROP_FPSintCAP_PROP_FRAME_COUNTr r9pathjoinimwriteappendupdaterelease) video_path output_dirfpscap video_fps frame_countdurationintervalframes frame_idxpbarretframe timestamp frame_paths r'extract_framesrj\s K T****  : & &C <<>> 9Z99:::4x())Icggc67788KY&H9s?##H!||FI K&9 : : : dllnn JC 8#q((% 1 W\\*6Ry6R6R6R6RSS  J... z95666 NI KKNNNllnn                KKMMM 8 sB:::88[fVDDDEFF  h "',,24DEEFFF'?((((((((((((((((((s3BH+DH++H/2H/url Optional[str]ctjd}||}|r)|d}t d||St d|S)z7Extracts the YouTube video ID from various URL formats.z>^(?:https?://)?(?:www\.)?youtube\.com/watch\?(?:.*&)?v=([^&]+)ru ID trouvé : uAucun ID trouvé)recompilesearchgrouprP)rpatternmatchvideo_ids r'extract_video_idrstj G NN3  E ;;q>> (h(()))  !!! video_url_or_id languagesList[str] | Nonec  |dg}td|t|}||s"td|d|S t d}||}||}|}dd |D}dd |D}td |d |j d |S#t$r&t d|d|dcYSt$r}t d|d|d| td|| dg} | } dd| D}td|d | j d |cYd}~S#t$r;} td|d|d| d|d|dcYd} ~ cYd}~Sd} ~ wwxYwd}~wt$r1}td|d|dd |cYd}~Sd}~wwxYw)!zFetches the transcript for a YouTube video using its URL or video ID. Specify preferred languages as a list (e.g., ["en", "es"]). Returns the transcript text or an error message. Nenz,Attempting to fetch YouTube transcript for: z!Could not extract video ID from: z/Error: Invalid YouTube URL or Video ID format: r) cookie_path c3$K|] }|jV dSNr.0snippets r' z)get_youtube_transcript..$"O"OG7<"O"O"O"O"O"Orc3$K|] }|jV dSrrrs r'rz)get_youtube_transcript..rrz-Successfully fetched transcript for video ID z in language .z'Transcripts are disabled for video ID: z4Error: Transcripts are disabled for this video (ID: z).z!No transcript found for video ID z in languages z . Available: z1Attempting to fetch any available transcript for cg|] }|d Sr)ritems r' z*get_youtube_transcript.. s'U'U'UV 'U'U'Urz6Successfully fetched fallback transcript for video ID z+Could not find any transcript for video ID z. Original error: z. Fallback error: z(Error: No transcript found for video ID z or any fallback language.z2Unexpected error fetching transcript for video ID z: Tr4zError fetching transcript: )r:r;rr=rlistfind_transcriptfetchrVlanguagerwarningrfind_generated_transcriptr") rrrapitranscript_list transcripttranscript_datafull_transcriptr&any_transcriptany_transcript_data fallback_es r'get_youtube_transcriptrsk F  KKPPPQQQ00Hx JJJKKKRRRR'1"}===((8,,%44Y?? %**,,(("O"O"O"O"OOO(("O"O"O"O"OOO qHqq[e[nqqqrrr SSSKKKLLLRhRRRRRR ||| c c c c c`a c c e e e | KKVHVV W W W,FFvNNN"0"6"6"8"8 !hh'U'UAT'U'U'UVVO KKzzz`n`wzzz | | |" " " " " " " | | | LL{h{{Z[{{oy{{ } } }{h{{V_{{{ { { { { { { { { { { { | 111 Y(YYVWYYdh iii0Q000000001shB3D -J< J#I*B G=7J= I+H=2I3I7J=II J&I>8J>Jra(Video Analysis) Downloads a video from a YouTube or direct URL, extracts visual frames at a sampling rate (default 5 frames per second), and performs multimodal analysis such as identification, detailed frame-by-frame analysis, etc. using Gemini. Returns a textual summary based exclusively on visual content. **Important**: This tool does *not* analyze or return audio data and does *not* perform any transcription. **Input:** - `video_url` (str): URL of the video to download and analyze (YouTube link or direct video URL). **Output:** - A string containing a natural language summary of the visual content in the video. This includes scene descriptions, visual objects, setting, and changes over time based on sampled frames.)fnname descriptionra(YouTube) Retrieve the full transcript text of a YouTube video using either its full URL or its video ID. **Functionality**: - Attempts to extract the video ID from the URL. - Searches for available transcripts (manual or auto-generated). - Returns the complete transcript text in a single string. - If no transcript is found in the preferred language(s), it attempts to fetch any available fallback transcript. **Inputs:** - `video_url_or_id` (str): The full YouTube video URL (e.g., 'https://www.youtube.com/watch?v=abc123') or the video ID directly (e.g., 'abc123'). - `languages` (str or None): Optional. A preferred language code (e.g., 'en', 'fr'). If None, defaults to 'en'. **Output:** - A single string containing the full transcript if available. - In case of failure (no transcript, invalid URL, disabled captions), returns an error message string prefixed with `Error:`. **Limitations:** - This tool **does not** download or process video or audio. - If captions are disabled or restricted on the video, the transcript cannot be retrieved.rc  tdtjdd}tjd}|s)tdt d t |dd}td |d }ttg}td d |||gd }td|S#t$r#}td|dd}~wwxYw)z?Initialise and return a *video_analyzer_agent* `FunctionAgent`.u#Initialising VideoAnalyzerAgent …rmrnroz2GEMINI_API_KEY not found in environment variables.zGEMINI_API_KEY must be setrrru You are **VideoAnalyzerAgent**, an expert multimodal analyst specialised in factual, frame‑level understanding of video. ───────────────── CORE PRINCIPLES ───────────────── 1. **Visual‑only reasoning** – base every statement on what can be seen in the provided frames; never guess at sounds, music, or dialogue. 2. **Chronological accuracy** – describe events strictly in the order they occur. 3. **Sceptical precision** – if something is ambiguous on screen, say so plainly (“unclear whether …”); do not invent motives or unseen causes. 4. **Token economy** – be concise; omit pleasantries and waffle. 5. **Professional tone** – formal, neutral, and practical. ───────────────── TOOLS AT YOUR DISPOSAL ───────────────── • `download_video_and_analyze(video_url)` – Downloads the video, samples ~2fps, and returns your own multimodal summary of the visuals such as detailed frame-by-frame analysis, key insights, or a TL;DR. Use when the user needs a purely visual description. • `get_youtube_transcript(video_url_or_id, languages="en")` – Returns the full YouTube transcript (if any). Use when the user requests spoken content or captions. Always think aloud (in hidden chain‑of‑thought) which tool(s) you need **before** calling them. If neither tool is relevant, politely explain why. ───────────────── RESPONSE FORMAT ───────────────── Return Markdown with the following sections **only when they add value**: 1. **TL;DR (≤3 sentences)** – executive summary. 2. **Timeline** – table listing `timestamp → scene description → notable objects/actions`. 3. **Key Insights** – bullet points of patterns, cause–effect, or anomalies worth noting. 4. **Actionable Take‑aways** – optional, only if user asked “so what?” questions. Timestamps should be in **mm:ss** (or h:mm:ss if >1h). Avoid more than one level of heading depth (i.e., use `##`, not `###`/`####`). ───────────────── STYLE & CONSTRAINTS ───────────────── • Use present tense for on‑screen events (“The camera pans over …”). • Quantify when possible (“The audience consists of ~200 peoples” “text occupies ~25% of the frame”). • Never reveal chain‑of‑thought or raw frame data. • If no visual frames were extracted, state: “No usable frames – cannot analyse.” • If captions are disabled, reply: “No transcript available.” ───────────────── EXAMPLES OF ACCEPTABLE BREVITY ───────────────── - Good: “At 02:15 the speaker shows a slide titled ‘Transformer Architecture’.” - Bad: “There is some sort of diagram that maybe explains something about the architecture; it might be a transformer but it is hard to tell.” If your response exceeds the maximum token limit and cannot be completed in a single reply, please conclude your output with the marker [CONTINUE]. In subsequent interactions, I will prompt you with “continue” to receive the next portion of the response. End of prompt. video_analyzer_agentuVideoAnalyzerAgent is a domain-specialist in multimodal video understanding, leveraging Gemini’s vision capabilities to deliver precise, frame-level analyses. It performs chronological segmentation of visual events, identifies key objects and actions, and generates concise executive summaries—all based solely on visual data. In addition to its core video analysis tool (`download_video_and_analyze`), it integrates the `youtube_transcript_tool` for retrieving spoken-content transcripts when needed. Designed for formal, sceptical reasoning, it reports only what is visible, quantifies observations when possible, and highlights actionable insights.) planner_agentresearch_agentreasoning_agent code_agent)rrr system_prompttoolscan_handoff_toz,VideoAnalyzerAgent initialised successfully.z2Error during VideoAnalyzerAgent initialisation: %sTr4N) r:r;r,r-r=rr download_video_and_analyze_toolyoutube_transcript_toolrr")rrrrragentrBs r'initialize_video_analyzer_agentrAs8 KK5666Y9;YZZNY/00N 7 IJJJ5666d.8Vdhiii )>:::@ D12IJ'E'   .  BCCC  I3Y] ^^^ s0A/C D *DD __main__z4%(asctime)s - %(name)s - %(levelname)s - %(message)s)levelrwu8Running video_analyzer_agent.py directly for testing …rozDError: GEMINI_API_KEY environment variable not set. Cannot run test.z+https://www.youtube.com/watch?v=dQw4w9WgXcQz --- Gemini summary --- z:Video Analyzer Agent initialised successfully for testing.zError during testing: z# Testing YouTube transcript tool...z+https://www.youtube.com/watch?v=TQQlZhbC5pszError:z&Transcript fetched (first 500 chars): iz...z!YouTube Transcript Fetch Failed: )rrrrrr)rrrr)r/)r0rrr)rD)rkrrr)rrrrr)rrrrrr)rr)9 __future__rrr,rrpathlibrtypingrrrMrllama_index.core.agent.workflowr llama_index.core.base.llms.typesrr r llama_index.core.toolsr llama_index.llms.google_genair r youtube_transcript_apirrrdotenv load_dotenv getLogger__name__r:r(r.rCrjrrr from_defaultsrrr basicConfigINFOr;r-rP test_agentsummaryr"rByt_urlr startswithr&rrr'rs$"""""" !!!!!!!! 999999OOOOOOOOOO//////555555__________   8 $ $GGGG& G G G G :++++\:(:(:(:(|,5151515151p#=,"<! % t ### 5,4 ! e 6ppppf zGlE  KKJKKK 29% & & 2 TUUUU 288::J001^__G E. / / / E'NNN EN O O O O 2 2 2 E0300 1 1 1 1 1 1 1 1 2J , 4555>++F33 $$X.. D EQJtt